source: src/main/java/weka/classifiers/lazy/LBR.java @ 20

Last change on this file since 20 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 40.0 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    aint with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17
18/*
19 *    LBR.java
20 *    The naive Bayesian classifier provides a simple and effective approach to
21 *    classifier learning, but its attribute independence assumption is often
22 *    violated in the real world. Lazy Bayesian Rules selectively relaxes the
23 *    independence assumption, achieving lower error rates over a range of
24 *    learning tasks.  LBR defers processing to classification time, making it
25 *    a highly efficient and accurate classification algorithm when small
26 *    numbers of objects are to be classified.
27 *
28 *    For more information, see
29 <!-- technical-plaintext-start -->
30 * Zijian Zheng, G. Webb (2000). Lazy Learning of Bayesian Rules. Machine Learning. 4(1):53-84.
31 <!-- technical-plaintext-end -->
32 *
33 *    http://www.cm.deakin.edu.au/webb
34 *
35 *    Copyright (C) 2001 Deakin University
36 *    School of Computing and Mathematics
37 *    Deakin University
38 *    Geelong, Vic, 3217, Australia
39 *
40 *    Email: zhw@deakin.edu.au
41 *
42 */
43
44package weka.classifiers.lazy;
45
46import weka.classifiers.Classifier;
47import weka.classifiers.AbstractClassifier;
48import weka.core.Attribute;
49import weka.core.Capabilities;
50import weka.core.Instance;
51import weka.core.Instances;
52import weka.core.RevisionHandler;
53import weka.core.RevisionUtils;
54import weka.core.Statistics;
55import weka.core.TechnicalInformation;
56import weka.core.TechnicalInformationHandler;
57import weka.core.Utils;
58import weka.core.Capabilities.Capability;
59import weka.core.TechnicalInformation.Field;
60import weka.core.TechnicalInformation.Type;
61
62import java.io.Serializable;
63import java.util.ArrayList;
64
65/**
66 <!-- globalinfo-start -->
67 * Lazy Bayesian Rules Classifier. The naive Bayesian classifier provides a simple and effective approach to classifier learning, but its attribute independence assumption is often violated in the real world. Lazy Bayesian Rules selectively relaxes the independence assumption, achieving lower error rates over a range of learning tasks. LBR defers processing to classification time, making it a highly efficient and accurate classification algorithm when small numbers of objects are to be classified.<br/>
68 * <br/>
69 * For more information, see:<br/>
70 * <br/>
71 * Zijian Zheng, G. Webb (2000). Lazy Learning of Bayesian Rules. Machine Learning. 4(1):53-84.
72 * <p/>
73 <!-- globalinfo-end -->
74 *
75 <!-- technical-bibtex-start -->
76 * BibTeX:
77 * <pre>
78 * &#64;article{Zheng2000,
79 *    author = {Zijian Zheng and G. Webb},
80 *    journal = {Machine Learning},
81 *    number = {1},
82 *    pages = {53-84},
83 *    title = {Lazy Learning of Bayesian Rules},
84 *    volume = {4},
85 *    year = {2000}
86 * }
87 * </pre>
88 * <p/>
89 <!-- technical-bibtex-end -->
90 *
91 <!-- options-start -->
92 * Valid options are: <p/>
93 *
94 * <pre> -D
95 *  If set, classifier is run in debug mode and
96 *  may output additional info to the console</pre>
97 *
98 <!-- options-end -->
99 *
100 * @author Zhihai Wang (zhw@deakin.edu.au) : July 2001 implemented the algorithm
101 * @author Jason Wells (wells@deakin.edu.au) : November 2001 added instance referencing via indexes
102 * @version $Revision: 5987 $
103 */
104public class LBR 
105  extends AbstractClassifier
106  implements TechnicalInformationHandler {
107
108  /** for serialization */
109  static final long serialVersionUID = 5648559277738985156L;
110 
111  /**
112   * Class for handling instances and the associated attributes. <p>
113   * Enables a set of indexes to a given dataset to be created and used
114   * with an algorithm.  This reduces the memory overheads and time required
115   * when manipulating and referencing Instances and their Attributes. 
116   */
117  public class Indexes
118    implements Serializable, RevisionHandler {
119
120    /** for serialization */
121    private static final long serialVersionUID = -2771490019751421307L;
122   
123    /** the array instance indexes **/
124    public boolean [] m_InstIndexes;
125   
126    /** the array attribute indexes **/
127    public boolean [] m_AttIndexes;
128   
129    /** the number of instances indexed **/
130    private int m_NumInstances;
131   
132    /** the number of attributes indexed **/
133    private int m_NumAtts;
134   
135    /** the array of instance indexes that are set to a either true or false **/
136    public int [] m_SequentialInstIndexes;
137   
138    /** an array of attribute indexes that are set to either true or false **/
139    public int [] m_SequentialAttIndexes;
140   
141    /** flag to check if sequential array must be rebuilt due to changes to the instance index*/
142    private boolean m_SequentialInstanceIndex_valid = false;
143   
144   /** flag to check if sequential array must be rebuilt due to changes to the attribute index */
145    private boolean m_SequentialAttIndex_valid = false;
146   
147    /** the number of instances "in use"  or set to a the original value (true or false) **/
148    public int m_NumInstsSet;
149   
150    /** the number of attributes "in use"  or set to a the original value (true or false) **/
151    public int m_NumAttsSet;
152   
153    /** the number of sequential instances "in use"  or set to a the original value (true or false) **/
154    public int m_NumSeqInstsSet;
155   
156    /** the number of sequential attributes "in use"  or set to a the original value (true or false) **/
157    public int m_NumSeqAttsSet;
158   
159    /** the Class Index for the data set **/
160    public int m_ClassIndex;
161   
162    /**
163     * constructor
164     * @param numInstances the number of instances in dataset
165     * @param numAtts the number of attributes in dataset
166     * @param value either true or false
167     * @param classIndex  Set to -1 if you want class attribute switched on or the value of the instances
168     * class index will be switched of and the class attibute will not be considered.
169     */
170    public Indexes(int numInstances, int numAtts, boolean value, int classIndex) {
171      /* to create an empty DATASET with all attributes indexed use FALSE
172       * to create a index of all instances and attributes use TRUE
173       */
174      // initialise counts
175      m_NumInstsSet =  m_NumInstances = numInstances;
176      m_NumAttsSet = m_NumAtts = numAtts;
177     
178      m_InstIndexes = new boolean [(int)numInstances];
179     
180      /* set all indexes to value */
181      int i = 0;
182      while(i < numInstances) {
183        m_InstIndexes[i] = value;
184        i++;
185      }
186     
187      m_AttIndexes = new boolean [(int)numAtts];
188     
189      /* set all indexes to true */
190      i = 0;
191      while(i < numAtts) {
192        m_AttIndexes[i] = true;
193        i++;
194      }
195      // if the value is false the dataset has no instances therefore no instances are set
196      if(value == false)
197        m_NumInstsSet = 0;
198      // no sequential array has been created
199      m_SequentialInstanceIndex_valid = false;
200      m_SequentialAttIndex_valid = false;
201     
202      // switch class attr to false as the class is not used in the dataset.  Set to -1 if you want the class attr included
203      if(classIndex != -1)
204        setAttIndex(classIndex, false);
205      m_ClassIndex = classIndex;
206    }
207   
208    /**
209     * constructor
210     * @param FromIndexes the object you want to copy
211     */
212    public Indexes(Indexes FromIndexes) {
213      // set counts to the FromIndexes counts
214      m_NumInstances = FromIndexes.getNumInstances();
215      m_NumInstsSet = FromIndexes.m_NumInstsSet;
216      m_NumAtts = FromIndexes.m_NumAtts;
217      m_NumAttsSet = FromIndexes.m_NumAttsSet;
218      m_InstIndexes = new boolean [m_NumInstances];
219     
220      System.arraycopy(FromIndexes.m_InstIndexes, 0, m_InstIndexes, 0, m_NumInstances);
221     
222      m_AttIndexes = new boolean [(int)m_NumAtts];
223     
224      System.arraycopy(FromIndexes.m_AttIndexes, 0, m_AttIndexes, 0, m_NumAtts);
225      m_ClassIndex = FromIndexes.m_ClassIndex;
226      m_SequentialInstanceIndex_valid = false;
227      m_SequentialAttIndex_valid = false;
228    }
229   
230    /**
231     *
232     * Changes the boolean value at the specified index in the InstIndexes array
233     *
234     * @param index the index of the instance
235     * @param value the value to set at the specified index
236     *
237     */
238    public void setInstanceIndex(int index, boolean value) {
239      if(index < 0 || index >= m_NumInstances)
240        throw new IllegalArgumentException("Invalid Instance Index value");
241      // checks that the index isn't alreading set to value
242      if(m_InstIndexes[(int)index] != value) {
243       
244        // set the value
245        m_InstIndexes[(int)index] = value;
246       
247        // a change has been made, so sequential array is invalid
248        m_SequentialInstanceIndex_valid = false;
249       
250        // change the number of values "in use" to appropriate value
251        if(value == false)
252          m_NumInstsSet--;
253        else
254          m_NumInstsSet++;
255      }
256   }
257   
258    /**
259     *
260     * Changes the boolean value at the specified index in the InstIndexes array
261     *
262     * @param Attributes array of attributes
263     * @param value the value to set at the specified index
264     *
265     */
266    public void setAtts(int [] Attributes, boolean value) {
267      for(int i = 0; i < m_NumAtts; i++) {
268        m_AttIndexes[i] = !value;
269      }
270      for (int i = 0; i < Attributes.length; i++)  {
271        m_AttIndexes[Attributes[i]] = value;
272      }
273      m_NumAttsSet = Attributes.length;
274      m_SequentialAttIndex_valid = false;
275    }
276   
277    /**
278     *
279     * Changes the boolean value at the specified index in the InstIndexes array
280     *
281     * @param Instances array of instances
282     * @param value the value to set at the specified index
283     *
284     */
285    public void setInsts(int [] Instances, boolean value) {
286      resetInstanceIndex(!value);
287      for (int i = 0; i < Instances.length; i++)  {
288        m_InstIndexes[Instances[i]] = value;
289      }
290      m_NumInstsSet = Instances.length;
291      m_SequentialInstanceIndex_valid = false;
292    }
293   
294   
295    /**
296     *
297     * Changes the boolean value at the specified index in the AttIndexes array
298     *
299     * @param index the index of the instance
300     * @param value the value to set at the specified index
301     *
302     */
303    public void setAttIndex(int index, boolean value) {
304      if(index < 0 || index >= m_NumAtts)
305        throw new IllegalArgumentException("Invalid Attribute Index value");
306      // checks that the index isn't alreading set to value
307      if(m_AttIndexes[(int)index] != value) {
308       
309        // set the value
310        m_AttIndexes[(int)index] = value;
311       
312        // a change has been made, so sparse array is invalid
313        m_SequentialAttIndex_valid = false; 
314       
315         // change the number of values "in use" to appropriate value
316        if(value == false)
317          m_NumAttsSet--;
318        else
319          m_NumAttsSet++;
320      }
321    }
322   
323    /**
324     *
325     * Returns the boolean value at the specified index in the Instance Index array
326     *
327     * @param index the index of the instance
328     * @return the boolean value at the specified index
329     */
330    public boolean getInstanceIndex(int index) {
331     
332      if(index < 0 || index >= m_NumInstances)
333        throw new IllegalArgumentException("Invalid index value");
334     
335      return m_InstIndexes[(int)index]; 
336    }
337   
338    /**
339     *
340     * Returns the boolean value at the specified index in the Sequential Instance Indexes array
341     *
342     * @param index the index of the instance
343     * @return the requested value
344     */
345    public int getSequentialInstanceIndex(int index) {
346     
347      if(index < 0 || index >= m_NumInstances)
348        throw new IllegalArgumentException("Invalid index value");
349     
350      return m_SequentialInstIndexes[(int)index]; 
351    }
352   
353    /**
354     *
355     * Resets the boolean value in the Instance Indexes array to a specified value
356     *
357     * @param value the value to set all indexes
358     *
359    */
360    public void resetInstanceIndex(boolean value) {
361      m_NumInstsSet = m_NumInstances;
362      for(int i = 0; i < m_NumInstances; i++) {
363        m_InstIndexes[i] = value;
364      }
365      if(value == false)
366        m_NumInstsSet =  0;
367      m_SequentialInstanceIndex_valid = false;
368    }
369   
370   /**
371    *
372    * Resets the boolean values in Attribute and Instance array to reflect an empty dataset withthe same attributes set as in the incoming Indexes Object
373    *
374    * @param FromIndexes the Indexes to be copied
375    *
376    */
377    public void resetDatasetBasedOn(Indexes FromIndexes) {
378      resetInstanceIndex(false);
379      resetAttIndexTo(FromIndexes);
380    }
381   
382    /**
383     *
384     * Resets the boolean value in AttIndexes array
385     *
386     * @param value the value to set the attributes to
387     *
388     */
389    public void resetAttIndex(boolean value) {
390      m_NumAttsSet =  m_NumAtts;
391      for(int i = 0; i < m_NumAtts; i++) {
392        m_AttIndexes[i] = value;
393      }
394      if(m_ClassIndex != -1)
395        setAttIndex(m_ClassIndex, false);
396      if(value == false)
397        m_NumAttsSet =  0;
398     m_SequentialAttIndex_valid = false;
399    }
400   
401    /**
402     *
403     * Resets the boolean value in AttIndexes array based on another set of Indexes
404     *
405     * @param FromIndexes the Indexes to be copied
406     *
407    */
408    public void resetAttIndexTo(Indexes FromIndexes) {
409      System.arraycopy(FromIndexes.m_AttIndexes, 0, m_AttIndexes, 0, m_NumAtts);
410      m_NumAttsSet =  FromIndexes.getNumAttributesSet();
411      m_ClassIndex = FromIndexes.m_ClassIndex;
412      m_SequentialAttIndex_valid = false;
413    }
414   
415    /**
416     *
417     * Returns the boolean value at the specified index in the Attribute Indexes array
418     *
419     * @param index the index of the Instance
420     * @return the boolean value
421     */
422    public boolean getAttIndex(int index) {
423     
424      if(index < 0 || index >= m_NumAtts)
425         throw new IllegalArgumentException("Invalid index value");
426     
427      return m_AttIndexes[(int)index];
428    }
429   
430    /**
431     *
432     * Returns the boolean value at the specified index in the Sequential Attribute Indexes array
433     *
434     * @param index the index of the Attribute
435     * @return the requested value
436     */
437    public int getSequentialAttIndex(int index) {
438     
439      if(index < 0 || index >= m_NumAtts)
440        throw new IllegalArgumentException("Invalid index value");
441     
442      return m_SequentialAttIndexes[(int)index];
443    }
444   
445    /**
446     *
447     * Returns the number of instances "in use"
448     *
449     * @return the number of instances "in use"
450     */
451    public int getNumInstancesSet() {
452     
453      return m_NumInstsSet;
454   }
455
456    /**
457     *
458     * Returns the number of instances in the dataset
459     *
460     * @return the number of instances in the dataset
461     */
462    public int getNumInstances() {
463     
464      return m_NumInstances;
465    }
466
467    /**
468     *
469     * Returns the number of instances in the Sequential array
470     *
471     * @return the number of instances in the sequential array
472     */
473    public int getSequentialNumInstances() {
474      // will always be the number set as the sequential array is for referencing only
475      return m_NumSeqInstsSet;
476    }
477   
478    /**
479     *
480     * Returns the number of attributes in the dataset
481     *
482     * @return the number of attributes
483     */
484    public int getNumAttributes() {
485     
486      return m_NumAtts;
487    }
488   
489    /**
490     *
491     * Returns the number of attributes "in use"
492     *
493     * @return the number of attributes "in use"
494     */
495    public int getNumAttributesSet() {
496     
497      return m_NumAttsSet;
498    }
499   
500    /**
501     *
502     * Returns the number of attributes in the Sequential array
503     *
504     * @return the number of attributes in the sequentual array
505     */
506    public int getSequentialNumAttributes() {
507      // will always be the number set as the sequential array is for referencing only
508      return m_NumSeqAttsSet;
509    }
510   
511    /**
512     *
513     * Returns whether or not the Sequential Instance Index requires rebuilding due to a change
514     *
515     * @return true if the sequential instance index needs rebuilding
516     */
517    public boolean isSequentialInstanceIndexValid() {
518     
519      return m_SequentialInstanceIndex_valid;
520    }
521   
522    /**
523     *
524     * Returns whether or not the Sequential Attribute Index requires rebuilding due to a change
525     *
526     * @return true if the sequential attribute index needs rebuilding
527     */
528    public boolean isSequentialAttIndexValid() {
529     
530      return m_SequentialAttIndex_valid;
531    }
532   
533    /**
534     *
535     * Sets both the Instance and Attribute indexes to a specified value
536     *
537     * @param value the value for the Instance and Attribute indices
538     */
539    public void setSequentialDataset(boolean value) {
540      setSequentialInstanceIndex(value);
541      setSequentialAttIndex(value);
542    }
543   
544    /**
545     *
546     * A Sequential Instance index is all those Instances that are set to the specified value placed in a sequential array.
547     * Each value in the sequential array contains the Instance index within the Indexes.
548     *
549     * @param value the sequential instance index
550     */
551    public void setSequentialInstanceIndex(boolean value) {
552     
553      if(m_SequentialInstanceIndex_valid == true)
554        return;
555     
556      /* needs to be recalculated */
557      int size;
558      size = m_NumInstsSet;
559     
560      m_SequentialInstIndexes = new int [(int)size];
561     
562      int j = 0;
563      for(int i = 0; i < m_NumInstances; i++) {
564        if(m_InstIndexes[i] == value) {
565          m_SequentialInstIndexes[j] = i;
566          j++;
567        }
568      }
569     
570      m_SequentialInstanceIndex_valid = true;
571      m_NumSeqInstsSet = j;
572    }
573   
574    /**
575     *
576     * A Sequential Attribute index is all those Attributes that are set to the specified value placed in a sequential array.
577     * Each value in the sequential array contains the Attribute index within the Indexes
578     *
579     * @param value the sequential attribute index
580     */
581    public void setSequentialAttIndex(boolean value) {
582     
583      if(m_SequentialAttIndex_valid == true)
584        return;
585     
586      /* needs to be recalculated */
587      int size;
588      size = m_NumAttsSet;
589     
590      m_SequentialAttIndexes = new int [(int)size];
591     
592      int j = 0;
593      for(int i = 0; i < m_NumAtts; i++) {
594        if(m_AttIndexes[i] == value) {
595          m_SequentialAttIndexes[j] = i;
596          j++;
597         }
598      }
599     
600      m_SequentialAttIndex_valid = true;
601      m_NumSeqAttsSet = j;
602    }
603   
604    /**
605     * Returns the revision string.
606     *
607     * @return          the revision
608     */
609    public String getRevision() {
610      return RevisionUtils.extract("$Revision: 5987 $");
611    }
612  } /* end of Indexes inner-class */
613 
614
615  /** All the counts for nominal attributes. */
616  protected int [][][] m_Counts;
617  /** All the counts for nominal attributes. */
618  protected int [][][] m_tCounts;
619  /** The prior probabilities of the classes. */
620  protected int [] m_Priors;
621  /** The prior probabilities of the classes. */
622  protected int [] m_tPriors;
623 
624  /** number of attributes for the dataset ***/
625  protected int m_numAtts;
626 
627  /** number of classes for dataset ***/
628  protected int m_numClasses;
629 
630 /** number of instances in dataset ***/
631  protected int m_numInsts;
632 
633  /** The set of instances used for current training. */
634  protected Instances m_Instances = null;
635 
636  /** leave-one-out errors on the training dataset. */
637  protected int m_Errors;
638 
639  /** leave-one-out error flags on the training dataaet. */
640  protected boolean [] m_ErrorFlags;
641 
642  /** best attribute's index list. maybe as output result */
643  protected ArrayList leftHand = new ArrayList();
644 
645  /** significantly lower */
646  protected static final double SIGNLOWER = 0.05;
647 
648  /** following is defined by wangzh,
649   * the number of instances to be classified incorrectly
650   * on the subset. */
651  protected boolean [] m_subOldErrorFlags;
652 
653  /** the number of instances to be classified incorrectly
654   * besides the subset. */
655  protected int m_RemainderErrors = 0;
656 
657  /** the number of instance to be processed */
658  protected int m_Number = 0;
659 
660  /** the Number of Instances to be used in building a classifiers */
661  protected int m_NumberOfInstances = 0;
662 
663  /** for printing in n-fold cross validation */
664  protected boolean m_NCV = false;
665 
666  /** index of instances and attributes for the given dataset */
667  protected Indexes m_subInstances;
668 
669  /** index of instances and attributes for the given dataset */
670  protected Indexes tempSubInstances;
671 
672  /** probability values array */
673  protected double [] posteriorsArray;
674  protected int bestCnt;
675  protected int tempCnt;
676  protected int forCnt;
677  protected int whileCnt;
678
679  /**
680   * @return a description of the classifier suitable for
681   * displaying in the explorer/experimenter gui
682   */
683  public String globalInfo() {
684
685    return 
686        "Lazy Bayesian Rules Classifier. The naive Bayesian classifier "
687      + "provides a simple and effective approach to classifier learning, "
688      + "but its attribute independence assumption is often violated in the "
689      + "real world. Lazy Bayesian Rules selectively relaxes the independence "
690      + "assumption, achieving lower error rates over a range of learning "
691      + "tasks. LBR defers processing to classification time, making it a "
692      + "highly efficient and accurate classification algorithm when small "
693      + "numbers of objects are to be classified.\n\n"
694      + "For more information, see:\n\n"
695      + getTechnicalInformation().toString();
696  }
697
698  /**
699   * Returns an instance of a TechnicalInformation object, containing
700   * detailed information about the technical background of this class,
701   * e.g., paper reference or book this class is based on.
702   *
703   * @return the technical information about this class
704   */
705  public TechnicalInformation getTechnicalInformation() {
706    TechnicalInformation        result;
707   
708    result = new TechnicalInformation(Type.ARTICLE);
709    result.setValue(Field.AUTHOR, "Zijian Zheng and G. Webb");
710    result.setValue(Field.YEAR, "2000");
711    result.setValue(Field.TITLE, "Lazy Learning of Bayesian Rules");
712    result.setValue(Field.JOURNAL, "Machine Learning");
713    result.setValue(Field.VOLUME, "4");
714    result.setValue(Field.NUMBER, "1");
715    result.setValue(Field.PAGES, "53-84");
716   
717    return result;
718  }
719
720  /**
721   * Returns default capabilities of the classifier.
722   *
723   * @return      the capabilities of this classifier
724   */
725  public Capabilities getCapabilities() {
726    Capabilities result = super.getCapabilities();
727    result.disableAll();
728
729    // attributes
730    result.enable(Capability.NOMINAL_ATTRIBUTES);
731    result.enable(Capability.MISSING_VALUES);
732
733    // class
734    result.enable(Capability.NOMINAL_CLASS);
735    result.enable(Capability.MISSING_CLASS_VALUES);
736
737    // instances
738    result.setMinimumNumberInstances(0);
739
740    return result;
741  }
742
743  /**
744   * For lazy learning, building classifier is only to prepare their inputs
745   * until classification time.
746   *
747   * @param instances set of instances serving as training data
748   * @throws Exception if the preparation has not been generated.
749   */
750  public void buildClassifier(Instances instances) throws Exception {
751    int attIndex, i, j;
752    bestCnt = 0;
753    tempCnt = 0;
754    forCnt = 0;
755    whileCnt = 0;
756   
757    // can classifier handle the data?
758    getCapabilities().testWithFail(instances);
759
760    // remove instances with missing class
761    instances = new Instances(instances);
762    instances.deleteWithMissingClass();
763   
764    m_numAtts = instances.numAttributes();
765    m_numClasses = instances.numClasses();
766    m_numInsts = instances.numInstances();
767
768    // Reserve space
769    m_Counts = new int[m_numClasses][m_numAtts][0];
770    m_Priors = new int[m_numClasses];
771    m_tCounts = new int[m_numClasses][m_numAtts][0];
772    m_tPriors = new int[m_numClasses];
773    m_subOldErrorFlags = new boolean[m_numInsts+1];
774   
775    m_Instances = instances;
776   
777    m_subInstances = new Indexes(m_numInsts, m_numAtts, true, m_Instances.classIndex());
778    tempSubInstances = new Indexes(m_numInsts, m_numAtts, true, m_Instances.classIndex());
779   
780   
781    posteriorsArray = new double[m_numClasses];
782   
783    // prepare arrays
784    for (attIndex = 0; attIndex < m_numAtts; attIndex++) {
785      Attribute attribute = (Attribute) instances.attribute(attIndex);
786      for (j = 0; j < m_numClasses; j++) {
787        m_Counts[j][attIndex] = new int[attribute.numValues()];
788        m_tCounts[j][attIndex] = new int[attribute.numValues()];
789      }
790    }
791
792    // Compute counts and priors
793    for(i = 0; i < m_numInsts; i++) {
794      Instance instance = (Instance) instances.instance(i);
795      int classValue = (int)instance.classValue();
796      // pointer for more efficient access to counts matrix in loop
797      int [][] countsPointer = m_tCounts[classValue];
798      for(attIndex = 0; attIndex < m_numAtts; attIndex++) {
799        countsPointer[attIndex][(int)instance.value(attIndex)]++;
800      }
801      m_tPriors[classValue]++;
802    }
803   
804    // Step 2: Leave-one-out on the training data set.
805    // get m_Errors and its flags array using leave-one-out.
806    m_ErrorFlags = new boolean[m_numInsts];
807   
808    m_Errors = leaveOneOut(m_subInstances, m_tCounts, m_tPriors, m_ErrorFlags);
809
810    if (m_Number == 0) {
811      m_NumberOfInstances = m_Instances.numInstances();
812    } else {
813      System.out.println(" ");
814      System.out.println("N-Fold Cross Validation: ");
815      m_NCV = true;
816    }
817  }
818 
819  /**
820   * Calculates the class membership probabilities
821   * for the given test instance.
822   * This is the most important method for Lazy Bayesian Rule algorithm.
823   *
824   * @param testInstance the instance to be classified
825   * @return predicted class probability distribution
826   * @throws Exception if distribution can't be computed
827   */
828  public double[] distributionForInstance(Instance testInstance)
829  throws Exception {
830   
831    int inst;
832    int subAttrIndex = 0;
833    int subInstIndex = 0;
834    int tempInstIndex = 0;
835    int attributeBest;
836    int subLocalErrors = 0;
837    int tempErrorsBest = 0;
838    boolean [] tempErrorFlagBest = null;
839    int [] tempD_subsetBestInsts = null;
840    int [] tempD_subsetBestAtts = null;
841    Indexes subInstances = new Indexes(m_numInsts, m_numAtts, true, m_Instances.classIndex());
842   
843    boolean [] subLocalErrorFlags = new  boolean [(int)subInstances.getNumInstances()+1];
844    // Step 2': Get localErrors, localErrorFlags, and training data set.
845    int localErrors = m_Errors;
846    boolean [] localErrorFlags = (boolean []) m_ErrorFlags.clone();
847   
848    // The number of errors on New, Not on Old in the subset.
849    int errorsNewNotOld = 0;
850    // The number of errors on Old, Not on New in the subset.
851    int errorsOldNotNew = 0;
852   
853    // Step 3:
854    leftHand.clear();
855
856    // Step 4: Beginning Repeat.
857    // Selecting all the attributes that can be moved to the lefthand.
858    while (localErrors >= 5) {
859      attributeBest = -1;
860      whileCnt++;
861      // Step 5:
862      tempErrorsBest = subInstances.getNumInstancesSet() + 1;
863      subInstances.setSequentialDataset(true);
864      // Step 6: selecting an attribute.
865      for (int attr = 0; attr < subInstances.m_NumSeqAttsSet; attr++){
866        forCnt++;
867        subAttrIndex = subInstances.m_SequentialAttIndexes[attr];
868        // Step 7: get the corresponding subset.
869       
870        m_RemainderErrors = 0;
871
872        // reset array to true
873        for(int i = 0; i < m_numInsts; i++) {
874          m_subOldErrorFlags[i] = true;
875        }
876        // reset indexes to reflect an empty dataset but with the same attrs as another dataset
877        tempSubInstances.resetDatasetBasedOn(subInstances);
878        // Get subset of the instances and its m_LastSecondErrors
879        for(inst = 0; inst < subInstances.m_NumSeqInstsSet; inst++) {
880          subInstIndex = subInstances.m_SequentialInstIndexes[inst];
881          if (m_Instances.instance(subInstIndex).value(subAttrIndex) == testInstance.value(subAttrIndex))  {
882            // add instance to subset list
883            tempSubInstances.setInstanceIndex(subInstIndex, true);
884            if (localErrorFlags[subInstIndex] == false ) {
885              m_subOldErrorFlags[subInstIndex] = false;
886            }
887          }
888          else  {
889            if (localErrorFlags[subInstIndex] == false ) {
890              m_RemainderErrors++;
891            }
892          }
893        } // end of for
894
895        // Step 7':
896        if (tempSubInstances.m_NumInstsSet < subInstances.m_NumInstsSet) {
897          // remove attribute from index
898          tempSubInstances.setAttIndex(subAttrIndex, false);
899          // Step 9: create a classifier on the subset.
900          // Compute counts and priors
901          // create sequential index of instances and attributes that are to be considered
902               
903          localNaiveBayes(tempSubInstances);
904         
905          subLocalErrors = leaveOneOut(tempSubInstances, m_Counts, m_Priors, subLocalErrorFlags);
906
907          errorsNewNotOld = 0;
908          errorsOldNotNew = 0;
909         
910          tempSubInstances.setSequentialDataset(true);
911         
912          for(int t_inst = 0; t_inst < tempSubInstances.m_NumSeqInstsSet; t_inst++) {
913            tempInstIndex = tempSubInstances.m_SequentialInstIndexes[t_inst];
914            if (subLocalErrorFlags[tempInstIndex] == false) {
915              // The number of errors on New, Not on Old in the subset.
916              if (m_subOldErrorFlags[tempInstIndex] == true) {
917                errorsNewNotOld ++;
918              }
919            } else {
920              // The number of errors on Old, Not on New in the subset.
921              if(m_subOldErrorFlags[tempInstIndex] == false) {
922                errorsOldNotNew ++;
923              }
924            }
925          } //end of for
926         
927          // Step 10 and Step 11:
928          int tempErrors = subLocalErrors + m_RemainderErrors;
929          // Step 12:
930          // Step 13: stopping criteria.
931          if((tempErrors < tempErrorsBest) && (binomP(errorsNewNotOld, errorsNewNotOld + errorsOldNotNew, 0.5 ) < SIGNLOWER))      {
932            // Step 14:
933            tempCnt++;
934            // --------------------------------------------------
935            //tempD_subsetBest = new Indexes(tempSubInstances);
936           
937            // -------------------------------------------------------------------------------
938            tempSubInstances.setSequentialDataset(true);
939            tempD_subsetBestInsts = (int []) tempSubInstances.m_SequentialInstIndexes.clone();
940            tempD_subsetBestAtts = (int []) tempSubInstances.m_SequentialAttIndexes.clone();
941            // -------------------------------------------------------------------------------
942            // Step 15:
943            tempErrorsBest = tempErrors;
944
945            tempErrorFlagBest = (boolean []) subLocalErrorFlags.clone();
946
947            // Step 16:
948            attributeBest = subAttrIndex;
949          } // end of if
950        } // end of if
951      } // end of main for
952     
953      // Step 20:
954      if(attributeBest != -1)  {
955        bestCnt++;
956        // Step 21:
957        leftHand.add(testInstance.attribute(attributeBest));
958        // ------------------------------------------------
959        // Step 22:
960        //tempD_subsetBest.setAttIndex(attributeBest, false);
961        //subInstances = tempD_subsetBest;
962        // ------------------------------------------------
963        subInstances.setInsts(tempD_subsetBestInsts, true);
964        subInstances.setAtts(tempD_subsetBestAtts, true);
965        subInstances.setAttIndex(attributeBest, false);
966        // -------------------------------------------------
967        // Step 25:
968        localErrors = tempErrorsBest;
969        localErrorFlags =  tempErrorFlagBest;
970       
971      } else {
972        break;
973      }
974    } // end of while
975   
976    // Step 27:
977    localNaiveBayes(subInstances);
978    return localDistributionForInstance(testInstance, subInstances);
979  }
980 
981  /**
982   * Returns a description of the classifier.
983   *
984   * @return a description of the classifier as a string.
985   */
986  public String toString() {
987   
988    if (m_Instances == null) {
989      return "Lazy Bayesian Rule: No model built yet.";
990    }
991   
992    try {
993      StringBuffer text = new StringBuffer
994      ("=== LBR Run information ===\n\n");
995     
996      text.append("Scheme:       weka.classifiers.LBR\n");
997     
998      text.append("Relation:     "
999      + m_Instances.attribute(m_Instances.classIndex()).name()
1000      + "\n");
1001     
1002      text.append("Instances:    "+m_Instances.numInstances()+"\n");
1003     
1004      text.append("Attributes:   "+m_Instances.numAttributes()+"\n");
1005     
1006      // Remains are printed by Evaulation.java
1007      return text.toString();
1008    } catch (Exception e) {
1009      e.printStackTrace();
1010      return "Can't Print Lazy Bayes Rule Classifier!";
1011    }
1012  }
1013
1014  /**
1015   * Leave-one-out strategy. For a given sample data set with n instances,
1016   * using (n - 1) instances by leaving one out and tested on the single
1017   * remaining case.
1018   * This is repeated n times in turn.
1019   * The final "Error" is the sum of the instances to be classified
1020   * incorrectly.
1021   *
1022   * @param instanceIndex set of instances serving as training data.
1023   * @param counts serving as all the counts of training data.
1024   * @param priors serving as the number of instances in each class.
1025   * @param errorFlags for the errors
1026   *
1027   * @return error flag array about each instance.
1028   * @throws Exception if something goes wrong
1029   **/
1030  public int leaveOneOut(Indexes instanceIndex, int [][][] counts, int [] priors, boolean [] errorFlags)  throws Exception {
1031   
1032   
1033    // ###### START LEAVE ONE OUT #############
1034    int tempClassValue;
1035    double posteriors;
1036    double sumForPriors;
1037    double sumForCounts;
1038    double max = 0;
1039    int maxIndex = 0;
1040    int AIndex, attIndex, clss;
1041    int inst;
1042    int errors = 0;
1043    int instIndex;
1044   
1045    instanceIndex.setSequentialDataset(true);
1046    int tempInstanceClassValue;
1047    int [] tempAttributeValues = new int[(int)instanceIndex.m_NumSeqAttsSet+1];
1048    Instance tempInstance;
1049    for(inst = 0; inst < instanceIndex.m_NumSeqInstsSet; inst++) {
1050      instIndex = instanceIndex.m_SequentialInstIndexes[inst];
1051      //get the leave-one-out instance
1052      tempInstance = (Instance) m_Instances.instance(instIndex);
1053      if (!tempInstance.classIsMissing()) {
1054      tempInstanceClassValue = (int)tempInstance.classValue();
1055      // pointer to first index of counts matrix for efficiency
1056      int [][] countsPointer = counts[tempInstanceClassValue];
1057      // Compute the counts and priors for (n-1) instances.
1058      for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
1059        AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
1060        tempAttributeValues[attIndex] = (int)tempInstance.value(AIndex);
1061        countsPointer[AIndex][tempAttributeValues[attIndex]]--;
1062      }
1063     
1064      priors[tempInstanceClassValue]--;
1065      max = 0;
1066      maxIndex= 0;
1067      // ###### LOCAL CLASSIFY INSTANCE ###########
1068      sumForPriors = Utils.sum(priors);
1069      for (clss = 0; clss < m_numClasses; clss++) {
1070        posteriors = 0.0;
1071        posteriors = (priors[clss] + 1) / (sumForPriors + m_numClasses);
1072       
1073        countsPointer = counts[clss];
1074        for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
1075          AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
1076          if (!tempInstance.isMissing(AIndex)) {
1077            sumForCounts = Utils.sum(countsPointer[AIndex]);
1078            posteriors *= ((countsPointer[AIndex][tempAttributeValues[attIndex]] + 1) / (sumForCounts + (double)tempInstance.attribute(AIndex).numValues()));
1079          }
1080        }
1081       
1082        if (posteriors > max) {
1083          maxIndex = clss;
1084          max = posteriors;
1085        }
1086      } // end of for
1087     
1088      if (max > 0) {
1089        tempClassValue = maxIndex;
1090      } else {
1091        tempClassValue = (int)Utils.missingValue();
1092      }
1093      // ###### END LOCAL CLASSIFY INSTANCE ###########
1094     
1095      // Adjudge error. Here using classIndex is incorrect,
1096      // it is index of the class attribute.
1097      if(tempClassValue == tempInstanceClassValue){
1098        errorFlags[instIndex] = true;
1099      } else {
1100        errorFlags[instIndex] = false;
1101        errors++;
1102      }
1103     
1104      countsPointer = counts[tempInstanceClassValue];
1105      for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
1106        AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
1107        counts[tempInstanceClassValue][AIndex][tempAttributeValues[attIndex]]++;
1108      }
1109     
1110      priors[tempInstanceClassValue]++;
1111      }
1112    } // end of for
1113    // ###### END LEAVE ONE OUT #############
1114    return errors;
1115  }
1116 
1117  /**
1118   * Class for building and using a simple Naive Bayes classifier.
1119   * For more information, see<p>
1120   *
1121   * Richard Duda and Peter Hart (1973).<i>Pattern
1122   * Classification and Scene Analysis</i>. Wiley, New York.
1123   *
1124   * This method only get m_Counts and m_Priors.
1125   *
1126   * @param instanceIndex set of instances serving as training data
1127   * @throws Exception if m_Counts and m_Priors have not been
1128   *  generated successfully
1129   */
1130  public void localNaiveBayes(Indexes instanceIndex) throws Exception {
1131    int attIndex = 0;
1132    int i, AIndex;
1133    int attVal = 0;
1134    int classVal = 0;
1135    Instance instance;
1136
1137    instanceIndex.setSequentialDataset(true);
1138
1139    // reset local counts
1140    for(classVal = 0; classVal < m_numClasses; classVal++) {
1141      // counts pointer mcTimesaver
1142      int [][] countsPointer1 = m_Counts[classVal];
1143      for(attIndex = 0; attIndex < m_numAtts; attIndex++) {
1144        Attribute attribute = m_Instances.attribute(attIndex);
1145         // love those pointers for saving time
1146         int [] countsPointer2 = countsPointer1[attIndex];
1147        for(attVal = 0; attVal < attribute.numValues(); attVal++)  {
1148          countsPointer2[attVal] = 0;
1149        }
1150     }
1151     m_Priors[classVal] = 0;
1152   }
1153
1154    for(i = 0; i < instanceIndex.m_NumSeqInstsSet; i++) {
1155      instance = (Instance) m_Instances.instance(instanceIndex.m_SequentialInstIndexes[i]);
1156      for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
1157        AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
1158        m_Counts[(int)instance.classValue()][AIndex][(int)instance.value(AIndex)]++;
1159      }
1160      m_Priors[(int)instance.classValue()]++;
1161    }
1162  }
1163   
1164  /**
1165   * Calculates the class membership probabilities.
1166   * for the given test instance.
1167   *
1168   * @param instance the instance to be classified
1169   * @param instanceIndex
1170   *
1171   * @return predicted class probability distribution
1172   * @throws Exception if distribution can't be computed
1173   */
1174  public double[] localDistributionForInstance(Instance instance, Indexes instanceIndex) throws Exception {
1175   
1176    double sumForPriors = 0;
1177    double sumForCounts = 0;
1178    int attIndex, AIndex;
1179    int numClassesOfInstance = instance.numClasses();
1180   
1181    sumForPriors = 0;
1182    sumForCounts = 0;
1183    instanceIndex.setSequentialDataset(true);
1184    // Calculate all of conditional probabilities.
1185    sumForPriors = Utils.sum(m_Priors) + numClassesOfInstance;
1186    for (int j = 0; j < numClassesOfInstance; j++) {
1187      // pointer to counts to make access more efficient in loop
1188      int [][] countsPointer = m_Counts[j];
1189      posteriorsArray[j] = (m_Priors[j] + 1) / (sumForPriors);
1190      for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
1191        AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
1192        sumForCounts = Utils.sum(countsPointer[AIndex]);
1193        if (!instance.isMissing(AIndex)) {
1194          posteriorsArray[j] *= ((countsPointer[AIndex][(int)instance.value(AIndex)] + 1) / (sumForCounts + (double)instance.attribute(AIndex).numValues()));
1195        }
1196      }
1197    }
1198   
1199    // Normalize probabilities
1200    Utils.normalize(posteriorsArray);
1201   
1202    return posteriorsArray;
1203  }
1204 
1205  /**
1206   * Significance test
1207   * binomp:
1208   *
1209   * @param r
1210   * @param n
1211   * @param p
1212   * @return returns the probability of obtaining r or fewer out of n
1213   * if the probability of an event is p.
1214   * @throws Exception if computation fails
1215   */
1216  public double binomP(double r, double n, double p) throws Exception {
1217   
1218    if (n == r) return 1.0;
1219    return Statistics.incompleteBeta(n-r, r+1.0, 1.0-p);
1220  }
1221 
1222  /**
1223   * Returns the revision string.
1224   *
1225   * @return            the revision
1226   */
1227  public String getRevision() {
1228    return RevisionUtils.extract("$Revision: 5987 $");
1229  }
1230 
1231  /**
1232   * Main method for testing this class.
1233   *
1234   * @param argv the options
1235   */
1236  public static void main(String [] argv) {
1237    runClassifier(new LBR(), argv);
1238  }
1239}
Note: See TracBrowser for help on using the repository browser.