Context Navigation

source: src/main/java/weka/classifiers/bayes/NaiveBayesMultinomial.java @ 18

Last change on this file since 18 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 12.2 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* NaiveBayesMultinomial.java
19	* Copyright (C) 2003 University of Waikato, Hamilton, New Zealand
20	*/
21
22	package weka.classifiers.bayes;
23
24	import weka.classifiers.Classifier;
25	import weka.classifiers.AbstractClassifier;
26	import weka.core.Capabilities;
27	import weka.core.Instance;
28	import weka.core.Instances;
29	import weka.core.RevisionUtils;
30	import weka.core.TechnicalInformation;
31	import weka.core.TechnicalInformationHandler;
32	import weka.core.Utils;
33	import weka.core.WeightedInstancesHandler;
34	import weka.core.Capabilities.Capability;
35	import weka.core.TechnicalInformation.Field;
36	import weka.core.TechnicalInformation.Type;
37
38	/**
39	<!-- globalinfo-start -->
40	* Class for building and using a multinomial Naive Bayes classifier. For more information see,<br/>
41	* <br/>
42	* Andrew Mccallum, Kamal Nigam: A Comparison of Event Models for Naive Bayes Text Classification. In: AAAI-98 Workshop on 'Learning for Text Categorization', 1998.<br/>
43	* <br/>
44	* The core equation for this classifier:<br/>
45	* <br/>
46	* P[Ci|D] = (P[D|Ci] x P[Ci]) / P[D] (Bayes rule)<br/>
47	* <br/>
48	* where Ci is class i and D is a document.
49	* <p/>
50	<!-- globalinfo-end -->
51	*
52	<!-- technical-bibtex-start -->
53	* BibTeX:
54	* <pre>
55	* @inproceedings{Mccallum1998,
56	* author = {Andrew Mccallum and Kamal Nigam},
57	* booktitle = {AAAI-98 Workshop on 'Learning for Text Categorization'},
58	* title = {A Comparison of Event Models for Naive Bayes Text Classification},
59	* year = {1998}
60	* }
61	* </pre>
62	* <p/>
63	<!-- technical-bibtex-end -->
64	*
65	<!-- options-start -->
66	* Valid options are: <p/>
67	*
68	* <pre> -D
69	* If set, classifier is run in debug mode and
70	* may output additional info to the console</pre>
71	*
72	<!-- options-end -->
73	*
74	* @author Andrew Golightly (acg4@cs.waikato.ac.nz)
75	* @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
76	* @version $Revision: 5928 $
77	*/
78	public class NaiveBayesMultinomial
79	extends AbstractClassifier
80	implements WeightedInstancesHandler,TechnicalInformationHandler {
81
82	/** for serialization */
83	static final long serialVersionUID = 5932177440181257085L;
84
85	/**
86	* log-probability that a word (w) occurs in a class (H), i.e. ln(Pr[w|H]).
87	* The matrix is laid out as m_probOfWordGivenClass[class][wordAttribute].
88	* NOTE: buildClassifier first accumulates raw counts here and then replaces them with the log of Pr[w|H]
89	*/
90	protected double[][] m_probOfWordGivenClass;
91
92	/** the Laplace-smoothed prior probability of each class (i.e. Pr[H]); a plain probability, not a log */
93	protected double[] m_probOfClass;
94
95	/** number of attributes in the training header; taken from Instances.numAttributes(), so the class attribute is counted too */
96	protected int m_numAttributes;
97
98	/** number of class values */
99	protected int m_numClasses;
100
101	/** cache of ln(n!) indexed by n; seeded with ln(0!) = ln(1!) = 0 and extended on demand by lnFactorial */
102	protected double[] m_lnFactorialCache = new double[]{0.0,0.0};
103
104	/** copy of header information (no instances) for use in toString method */
105	protected Instances m_headerInfo;
106
107	/**
108	* Returns a string describing this classifier
109	* @return a description of the classifier suitable for
110	* displaying in the explorer/experimenter gui
111	*/
112	public String globalInfo() {
113	return
114	"Class for building and using a multinomial Naive Bayes classifier. "
115	+ "For more information see,\n\n"
116	+ getTechnicalInformation().toString() + "\n\n"
117	+ "The core equation for this classifier:\n\n"
118	+ "P[Ci\|D] = (P[D\|Ci] x P[Ci]) / P[D] (Bayes rule)\n\n"
119	+ "where Ci is class i and D is a document.";
120	}
121
122	/**
123	* Returns an instance of a TechnicalInformation object, containing
124	* detailed information about the technical background of this class,
125	* e.g., paper reference or book this class is based on.
126	*
127	* @return the technical information about this class
128	*/
129	public TechnicalInformation getTechnicalInformation() {
130	TechnicalInformation result;
131
132	result = new TechnicalInformation(Type.INPROCEEDINGS);
133	result.setValue(Field.AUTHOR, "Andrew Mccallum and Kamal Nigam");
134	result.setValue(Field.YEAR, "1998");
135	result.setValue(Field.TITLE, "A Comparison of Event Models for Naive Bayes Text Classification");
136	result.setValue(Field.BOOKTITLE, "AAAI-98 Workshop on 'Learning for Text Categorization'");
137
138	return result;
139	}
140
141	/**
142	* Returns default capabilities of the classifier.
143	*
144	* @return the capabilities of this classifier
145	*/
146	public Capabilities getCapabilities() {
147	Capabilities result = super.getCapabilities();
148	result.disableAll();
149
150	// attributes
151	result.enable(Capability.NUMERIC_ATTRIBUTES);
152
153	// class
154	result.enable(Capability.NOMINAL_CLASS);
155	result.enable(Capability.MISSING_CLASS_VALUES);
156
157	return result;
158	}
159
160	/**
161	* Generates the classifier.
162	*
163	* @param instances set of instances serving as training data
164	* @throws Exception if the classifier has not been generated successfully
165	*/
166	public void buildClassifier(Instances instances) throws Exception
167	{
168	// can classifier handle the data?
169	getCapabilities().testWithFail(instances);
170
171	// remove instances with missing class
172	instances = new Instances(instances);
173	instances.deleteWithMissingClass();
174
175	m_headerInfo = new Instances(instances, 0);
176	m_numClasses = instances.numClasses();
177	m_numAttributes = instances.numAttributes();
178	m_probOfWordGivenClass = new double[m_numClasses][];
179
180	/*
181	initialising the matrix of word counts
182	NOTE: Laplace estimator introduced in case a word that does not appear for a class in the
183	training set does so for the test set
184	*/
185	for(int c = 0; c<m_numClasses; c++)
186	{
187	m_probOfWordGivenClass[c] = new double[m_numAttributes];
188	for(int att = 0; att<m_numAttributes; att++)
189	{
190	m_probOfWordGivenClass[c][att] = 1;
191	}
192	}
193
194	//enumerate through the instances
195	Instance instance;
196	int classIndex;
197	double numOccurences;
198	double[] docsPerClass = new double[m_numClasses];
199	double[] wordsPerClass = new double[m_numClasses];
200
201	java.util.Enumeration enumInsts = instances.enumerateInstances();
202	while (enumInsts.hasMoreElements())
203	{
204	instance = (Instance) enumInsts.nextElement();
205	classIndex = (int)instance.value(instance.classIndex());
206	docsPerClass[classIndex] += instance.weight();
207
208	for(int a = 0; a<instance.numValues(); a++)
209	if(instance.index(a) != instance.classIndex())
210	{
211	if(!instance.isMissing(a))
212	{
213	numOccurences = instance.valueSparse(a) * instance.weight();
214	if(numOccurences < 0)
215	throw new Exception("Numeric attribute values must all be greater or equal to zero.");
216	wordsPerClass[classIndex] += numOccurences;
217	m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurences;
218	}
219	}
220	}
221
222	/*
223	normalising probOfWordGivenClass values
224	and saving each value as the log of each value
225	*/
226	for(int c = 0; c<m_numClasses; c++)
227	for(int v = 0; v<m_numAttributes; v++)
228	m_probOfWordGivenClass[c][v] = Math.log(m_probOfWordGivenClass[c][v] / (wordsPerClass[c] + m_numAttributes - 1));
229
230	/*
231	calculating Pr(H)
232	NOTE: Laplace estimator introduced in case a class does not get mentioned in the set of
233	training instances
234	*/
235	final double numDocs = instances.sumOfWeights() + m_numClasses;
236	m_probOfClass = new double[m_numClasses];
237	for(int h=0; h<m_numClasses; h++)
238	m_probOfClass[h] = (double)(docsPerClass[h] + 1)/numDocs;
239	}
240
241	/**
242	* Calculates the class membership probabilities for the given test
243	* instance.
244	*
245	* @param instance the instance to be classified
246	* @return predicted class probability distribution
247	* @throws Exception if there is a problem generating the prediction
248	*/
249	public double [] distributionForInstance(Instance instance) throws Exception
250	{
251	double[] probOfClassGivenDoc = new double[m_numClasses];
252
253	//calculate the array of log(Pr[D\|C])
254	double[] logDocGivenClass = new double[m_numClasses];
255	for(int h = 0; h<m_numClasses; h++)
256	logDocGivenClass[h] = probOfDocGivenClass(instance, h);
257
258	double max = logDocGivenClass[Utils.maxIndex(logDocGivenClass)];
259	double probOfDoc = 0.0;
260
261	for(int i = 0; i<m_numClasses; i++)
262	{
263	probOfClassGivenDoc[i] = Math.exp(logDocGivenClass[i] - max) * m_probOfClass[i];
264	probOfDoc += probOfClassGivenDoc[i];
265	}
266
267	Utils.normalize(probOfClassGivenDoc,probOfDoc);
268
269	return probOfClassGivenDoc;
270	}
271
272	/**
273	* log(N!) + (for all the words)(log(Pi^ni) - log(ni!))
274	*
275	* where
276	* N is the total number of words
277	* Pi is the probability of obtaining word i
278	* ni is the number of times the word at index i occurs in the document
279	*
280	* @param inst The instance to be classified
281	* @param classIndex The index of the class we are calculating the probability with respect to
282	*
283	* @return The log of the probability of the document occuring given the class
284	*/
285
286	private double probOfDocGivenClass(Instance inst, int classIndex)
287	{
288	double answer = 0;
289	//double totalWords = 0; //no need as we are not calculating the factorial at all.
290
291	double freqOfWordInDoc; //should be double
292	for(int i = 0; i<inst.numValues(); i++)
293	if(inst.index(i) != inst.classIndex())
294	{
295	freqOfWordInDoc = inst.valueSparse(i);
296	//totalWords += freqOfWordInDoc;
297	answer += (freqOfWordInDoc * m_probOfWordGivenClass[classIndex][inst.index(i)]
298	); //- lnFactorial(freqOfWordInDoc));
299	}
300
301	//answer += lnFactorial(totalWords);//The factorial terms don't make
302	//any difference to the classifier's
303	//accuracy, so not needed.
304
305	return answer;
306	}
307
308	/**
309	* Fast computation of ln(n!) for non-negative ints
310	*
311	* negative ints are passed on to the general gamma-function
312	* based version in weka.core.SpecialFunctions
313	*
314	* if the current n value is higher than any previous one,
315	* the cache is extended and filled to cover it
316	*
317	* the common case is reduced to a simple array lookup
318	*
319	* @param n the integer
320	* @return ln(n!)
321	*/
322
323	public double lnFactorial(int n)
324	{
325	if (n < 0) return weka.core.SpecialFunctions.lnFactorial(n);
326
327	if (m_lnFactorialCache.length <= n) {
328	double[] tmp = new double[n+1];
329	System.arraycopy(m_lnFactorialCache,0,tmp,0,m_lnFactorialCache.length);
330	for(int i = m_lnFactorialCache.length; i < tmp.length; i++)
331	tmp[i] = tmp[i-1] + Math.log(i);
332	m_lnFactorialCache = tmp;
333	}
334
335	return m_lnFactorialCache[n];
336	}
337
338	/**
339	* Returns a string representation of the classifier.
340	*
341	* @return a string representation of the classifier
342	*/
343	public String toString()
344	{
345	StringBuffer result = new StringBuffer("The independent probability of a class\n--------------------------------------\n");
346
347	for(int c = 0; c<m_numClasses; c++)
348	result.append(m_headerInfo.classAttribute().value(c)).append("\t").append(Double.toString(m_probOfClass[c])).append("\n");
349
350	result.append("\nThe probability of a word given the class\n-----------------------------------------\n\t");
351
352	for(int c = 0; c<m_numClasses; c++)
353	result.append(m_headerInfo.classAttribute().value(c)).append("\t");
354
355	result.append("\n");
356
357	for(int w = 0; w<m_numAttributes; w++)
358	{
359	result.append(m_headerInfo.attribute(w).name()).append("\t");
360	for(int c = 0; c<m_numClasses; c++)
361	result.append(Double.toString(Math.exp(m_probOfWordGivenClass[c][w]))).append("\t");
362	result.append("\n");
363	}
364
365	return result.toString();
366	}
367
368	/**
369	* Returns the revision string.
370	*
371	* @return the revision
372	*/
373	public String getRevision() {
374	return RevisionUtils.extract("$Revision: 5928 $");
375	}
376
377	/**
378	* Main method for testing this class.
379	*
380	* @param argv the options
381	*/
382	public static void main(String [] argv) {
383	runClassifier(new NaiveBayesMultinomial(), argv);
384	}
385	}
386

Note: See TracBrowser for help on using the repository browser.

Download in other formats: