/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * NaiveBayesMultinomial.java
 * Copyright (C) 2003 University of Waikato, Hamilton, New Zealand
 */

package weka.classifiers.bayes;

import weka.classifiers.Classifier;
import weka.classifiers.AbstractClassifier;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
/**
 <!-- globalinfo-start -->
 * Class for building and using a multinomial Naive Bayes classifier. For more information see,<br/>
 * <br/>
 * Andrew Mccallum, Kamal Nigam: A Comparison of Event Models for Naive Bayes Text Classification. In: AAAI-98 Workshop on 'Learning for Text Categorization', 1998.<br/>
 * <br/>
 * The core equation for this classifier:<br/>
 * <br/>
 * P[Ci|D] = (P[D|Ci] x P[Ci]) / P[D] (Bayes rule)<br/>
 * <br/>
 * where Ci is class i and D is a document.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * @inproceedings{Mccallum1998,
 *    author = {Andrew Mccallum and Kamal Nigam},
 *    booktitle = {AAAI-98 Workshop on 'Learning for Text Categorization'},
 *    title = {A Comparison of Event Models for Naive Bayes Text Classification},
 *    year = {1998}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 <!-- options-end -->
 *
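 * A minimal usage sketch (train and test here are illustrative placeholders;
 * the attributes must be numeric word counts and the class attribute nominal):
 * <pre>
 * Instances train = ...;   // word-count data with a nominal class attribute
 * NaiveBayesMultinomial nbm = new NaiveBayesMultinomial();
 * nbm.buildClassifier(train);
 * double[] dist = nbm.distributionForInstance(test.instance(0));
 * </pre>
 * <p/>
 *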
 * @author Andrew Golightly (acg4@cs.waikato.ac.nz)
 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
 * @version $Revision: 5928 $
 */
public class NaiveBayesMultinomial
  extends AbstractClassifier
  implements WeightedInstancesHandler, TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = 5932177440181257085L;

  /**
   * probability that a word (w) exists in a class (H) (i.e. Pr[w|H]).
   * The matrix is in this format: probOfWordGivenClass[class][wordAttribute]
   * NOTE: the values are actually the log of Pr[w|H]
   */
  protected double[][] m_probOfWordGivenClass;

  /** the probability of a class (i.e. Pr[H]) */
  protected double[] m_probOfClass;

  /** number of attributes (the word attributes plus the class attribute) */
  protected int m_numAttributes;

  /** number of class values */
  protected int m_numClasses;

  /** cache lnFactorial computations */
  protected double[] m_lnFactorialCache = new double[]{0.0, 0.0};

  /** copy of header information for use in toString method */
  protected Instances m_headerInfo;

  /**
   * Returns a string describing this classifier
   * @return a description of the classifier suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "Class for building and using a multinomial Naive Bayes classifier. "
      + "For more information see,\n\n"
      + getTechnicalInformation().toString() + "\n\n"
      + "The core equation for this classifier:\n\n"
      + "P[Ci|D] = (P[D|Ci] x P[Ci]) / P[D] (Bayes rule)\n\n"
      + "where Ci is class i and D is a document.";
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Andrew Mccallum and Kamal Nigam");
    result.setValue(Field.YEAR, "1998");
    result.setValue(Field.TITLE, "A Comparison of Event Models for Naive Bayes Text Classification");
    result.setValue(Field.BOOKTITLE, "AAAI-98 Workshop on 'Learning for Text Categorization'");

    return result;
  }

  /**
   * Returns default capabilities of the classifier.
   *
   * @return the capabilities of this classifier
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enable(Capability.NUMERIC_ATTRIBUTES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    return result;
  }

  /**
   * Generates the classifier.
   *
   * @param instances set of instances serving as training data
   * @throws Exception if the classifier has not been generated successfully
   */
  public void buildClassifier(Instances instances) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // remove instances with missing class
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    m_headerInfo = new Instances(instances, 0);
    m_numClasses = instances.numClasses();
    m_numAttributes = instances.numAttributes();
    m_probOfWordGivenClass = new double[m_numClasses][];

    /*
      initialising the matrix of word counts
      NOTE: Laplace estimator introduced in case a word that does not appear for a class in the
      training set does so for the test set
    */
    for (int c = 0; c < m_numClasses; c++) {
      m_probOfWordGivenClass[c] = new double[m_numAttributes];
      for (int att = 0; att < m_numAttributes; att++) {
        m_probOfWordGivenClass[c][att] = 1;
      }
    }

    // enumerate through the instances
    Instance instance;
    int classIndex;
    double numOccurences;
    double[] docsPerClass = new double[m_numClasses];
    double[] wordsPerClass = new double[m_numClasses];

    java.util.Enumeration enumInsts = instances.enumerateInstances();
    while (enumInsts.hasMoreElements()) {
      instance = (Instance) enumInsts.nextElement();
      classIndex = (int) instance.value(instance.classIndex());
      docsPerClass[classIndex] += instance.weight();

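      // iterate over the values stored in the instance: only the non-zero entries
      // for a SparseInstance, every attribute for a dense instance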
      for (int a = 0; a < instance.numValues(); a++) {
        if (instance.index(a) != instance.classIndex()) {
          if (!instance.isMissing(a)) {
            numOccurences = instance.valueSparse(a) * instance.weight();
            if (numOccurences < 0)
              throw new Exception("Numeric attribute values must all be greater than or equal to zero.");
            wordsPerClass[classIndex] += numOccurences;
            m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurences;
          }
        }
      }
    }

    /*
      normalising probOfWordGivenClass values
      and saving each value as the log of each value
    */
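    // Laplace denominator: each of the (m_numAttributes - 1) word attributes contributed a
    // pseudo-count of 1 above; the class attribute is not a word, hence the "- 1"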
    for (int c = 0; c < m_numClasses; c++)
      for (int v = 0; v < m_numAttributes; v++)
        m_probOfWordGivenClass[c][v] =
          Math.log(m_probOfWordGivenClass[c][v] / (wordsPerClass[c] + m_numAttributes - 1));

    /*
      calculating Pr(H)
      NOTE: Laplace estimator introduced in case a class does not get mentioned in the set of
      training instances
    */
    final double numDocs = instances.sumOfWeights() + m_numClasses;
    m_probOfClass = new double[m_numClasses];
    for (int h = 0; h < m_numClasses; h++)
      m_probOfClass[h] = (docsPerClass[h] + 1) / numDocs;
  }

  /**
   * Calculates the class membership probabilities for the given test
   * instance.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @throws Exception if there is a problem generating the prediction
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
    double[] probOfClassGivenDoc = new double[m_numClasses];

    // calculate the array of log(Pr[D|C])
    double[] logDocGivenClass = new double[m_numClasses];
    for (int h = 0; h < m_numClasses; h++)
      logDocGivenClass[h] = probOfDocGivenClass(instance, h);

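    // subtract the largest log-likelihood before exponentiating so the exp() calls cannot
    // underflow to zero; the common shift cancels out in the normalisation below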
    double max = logDocGivenClass[Utils.maxIndex(logDocGivenClass)];
    double probOfDoc = 0.0;

    for (int i = 0; i < m_numClasses; i++) {
      probOfClassGivenDoc[i] = Math.exp(logDocGivenClass[i] - max) * m_probOfClass[i];
      probOfDoc += probOfClassGivenDoc[i];
    }

    Utils.normalize(probOfClassGivenDoc, probOfDoc);

    return probOfClassGivenDoc;
  }

  /**
   * Computes the log of the document likelihood for a given class:
   *
   *   log(N!) + sum over all words i of (ni * log(Pi) - log(ni!))
   *
   * where
   *   N is the total number of words in the document
   *   Pi is the probability of obtaining word i for the given class
   *   ni is the number of times the word at index i occurs in the document
   *
   * (the factorial terms are omitted in the implementation since they do not
   * depend on the class; see the comment in the method body)
   *
   * @param inst The instance to be classified
   * @param classIndex The index of the class we are calculating the probability with respect to
   *
   * @return The log of the probability of the document occurring given the class
   */
  private double probOfDocGivenClass(Instance inst, int classIndex) {
    double answer = 0;
    //double totalWords = 0; //no need as we are not calculating the factorial at all.

    double freqOfWordInDoc;
    for (int i = 0; i < inst.numValues(); i++)
      if (inst.index(i) != inst.classIndex()) {
        freqOfWordInDoc = inst.valueSparse(i);
        //totalWords += freqOfWordInDoc;
        answer += (freqOfWordInDoc * m_probOfWordGivenClass[classIndex][inst.index(i)]);
          //- lnFactorial(freqOfWordInDoc));
      }

    //answer += lnFactorial(totalWords);
    // NOTE: the factorial terms log(N!) and -log(ni!) are identical for every class (they
    // depend only on the document), so they cancel in the final normalisation and are omitted.

    return answer;
  }

  /**
   * Fast computation of ln(n!) for non-negative ints
   *
   * negative ints are passed on to the general gamma-function
   * based version in weka.core.SpecialFunctions
   *
   * if the current n value is higher than any previous one,
   * the cache is extended and filled to cover it
   *
   * the common case is reduced to a simple array lookup
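   *
   * NOTE: since probOfDocGivenClass omits the factorial terms, this method is
   * currently not called anywhere within this class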
   *
   * @param n the integer
   * @return ln(n!)
   */
  public double lnFactorial(int n) {
    if (n < 0) return weka.core.SpecialFunctions.lnFactorial(n);

    if (m_lnFactorialCache.length <= n) {
      double[] tmp = new double[n + 1];
      System.arraycopy(m_lnFactorialCache, 0, tmp, 0, m_lnFactorialCache.length);
      for (int i = m_lnFactorialCache.length; i < tmp.length; i++)
        tmp[i] = tmp[i - 1] + Math.log(i);
      m_lnFactorialCache = tmp;
    }

    return m_lnFactorialCache[n];
  }

  /**
   * Returns a string representation of the classifier.
   *
   * @return a string representation of the classifier
   */
  public String toString() {
    StringBuffer result = new StringBuffer("The independent probability of a class\n--------------------------------------\n");

    for (int c = 0; c < m_numClasses; c++)
      result.append(m_headerInfo.classAttribute().value(c)).append("\t").append(Double.toString(m_probOfClass[c])).append("\n");

    result.append("\nThe probability of a word given the class\n-----------------------------------------\n\t");

    for (int c = 0; c < m_numClasses; c++)
      result.append(m_headerInfo.classAttribute().value(c)).append("\t");

    result.append("\n");

    for (int w = 0; w < m_numAttributes; w++) {
      result.append(m_headerInfo.attribute(w).name()).append("\t");
      for (int c = 0; c < m_numClasses; c++)
        result.append(Double.toString(Math.exp(m_probOfWordGivenClass[c][w]))).append("\t");
      result.append("\n");
    }

    return result.toString();
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 5928 $");
  }

  /**
   * Main method for testing this class.
   *
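   * A typical command line, assuming weka.jar is on the classpath (data.arff is
   * a placeholder for your own ARFF file); with no separate test file Weka
   * evaluates via cross-validation:
   * <pre>java weka.classifiers.bayes.NaiveBayesMultinomial -t data.arff</pre>
   *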
   * @param argv the options
   */
  public static void main(String[] argv) {
    runClassifier(new NaiveBayesMultinomial(), argv);
  }
}