Context Navigation

source: src/main/java/weka/attributeSelection/SignificanceAttributeEval.java @ 20

Last change on this file since 20 was 4, checked in by gnappo, 15 years ago
Import di weka.
File size: 16.4 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* SignificanceAttributeEval.java
19	* Copyright (C) 2009 Adrian Pino
20	* Copyright (C) 2009 University of Waikato, Hamilton, NZ
21	*
22	*/
23	package weka.attributeSelection;
24
25	import java.util.ArrayList;
26	import java.util.Enumeration;
27	import java.util.List;
28	import java.util.Vector;
29
30	import weka.core.Capabilities;
31	import weka.core.Instance;
32	import weka.core.Instances;
33	import weka.core.Option;
34	import weka.core.OptionHandler;
35	import weka.core.RevisionUtils;
36	import weka.core.TechnicalInformation;
37	import weka.core.TechnicalInformationHandler;
38	import weka.core.Utils;
39	import weka.core.Capabilities.Capability;
40	import weka.core.TechnicalInformation.Field;
41	import weka.core.TechnicalInformation.Type;
42	import weka.filters.Filter;
43	import weka.filters.supervised.attribute.Discretize;
44
45	/**
46	<!-- globalinfo-start -->
47	* Significance :<br/>
48	* <br/>
49	* Evaluates the worth of an attribute by computing the Probabilistic Significance as a two-way function.<br/>
50	* (attribute-classes and classes-attribute association)<br/>
51	* <br/>
52	* For more information see:<br/>
53	* <br/>
54	* Amir Ahmad, Lipika Dey (2004). A feature selection technique for classificatory analysis.
55	* <p/>
56	<!-- globalinfo-end -->
57	*
58	<!-- options-start -->
59	* Valid options are: <p/>
60	*
61	* <pre> -M
62	* treat missing values as a separate value.</pre>
63	*
64	<!-- options-end -->
65	*
66	<!-- technical-bibtex-start -->
67	* BibTeX:
68	* <pre>
69	* @phdthesis{Ahmad2004,
70	* author = {Amir Ahmad and Lipika Dey},
71	* month = {October},
72	* publisher = {ELSEVIER},
73	* title = {A feature selection technique for classificatory analysis},
74	* year = {2004}
75	* }
76	* </pre>
77	* <p/>
78	<!-- technical-bibtex-end -->
79	*
80	* @author Adrian Pino (apinoa@facinf.uho.edu.cu)
81	* @version $Revision: 5447 $
82	*/
83	public class SignificanceAttributeEval
84	extends ASEvaluation
85	implements AttributeEvaluator, OptionHandler, TechnicalInformationHandler {
86
87	/** for serialization */
88	static final long serialVersionUID = -8504656625598579926L;
89
90	/** The training instances */
91	private Instances m_trainInstances;
92
93	/** The class index */
94	private int m_classIndex;
95
96	/** The number of attributes */
97	private int m_numAttribs;
98
99	/** The number of instances */
100	private int m_numInstances;
101
102	/** The number of classes */
103	private int m_numClasses;
104
105	/** Merge missing values */
106	private boolean m_missing_merge;
107
108	/**
109	* Returns a string describing this attribute evaluator
110	* @return a description of the evaluator suitable for
111	* displaying in the explorer/experimenter gui
112	*/
113	public String globalInfo() {
114	return "Significance :\n\nEvaluates the worth of an attribute "
115	+"by computing the Probabilistic Significance as a two-way function.\n"
116	+"(atributte-classes and classes-atribute association)\n\n"
117	+ "For more information see:\n\n"
118	+ getTechnicalInformation().toString();
119	}
120
121	/**
122	* Returns an instance of a TechnicalInformation object, containing
123	* detailed information about the technical background of this class,
124	* e.g., paper reference or book this class is based on.
125	*
126	* @return the technical information about this class
127	*/
128	public TechnicalInformation getTechnicalInformation() {
129	TechnicalInformation result;
130
131	result = new TechnicalInformation(Type.PHDTHESIS);
132	result.setValue(Field.AUTHOR, "Amir Ahmad and Lipika Dey");
133	result.setValue(Field.YEAR, "2004");
134	result.setValue(Field.MONTH, "October");
135	result.setValue(Field.TITLE, "A feature selection technique for classificatory analysis");
136	result.setValue(Field.PUBLISHER, "ELSEVIER");
137
138	return result;
139	}
140
141
/**
 * Creates a new evaluator and puts its options into their initial state
 * by calling {@code resetOptions()}.
 */
public SignificanceAttributeEval() {
  this.resetOptions();
}
148
149
150	/**
151	* Returns an enumeration describing the available options.
152	* @return an enumeration of all the available options.
153	**/
154	public Enumeration listOptions () {
155	Vector newVector = new Vector(1);
156	newVector.addElement(new Option("\ttreat missing values as a separate "
157	+ "value.", "M", 0, "-M"));
158	return newVector.elements();
159	}
160
161
162	/**
163	* Parses a given list of options. <p/>
164	*
165	<!-- options-start -->
166	* Valid options are: <p/>
167	*
168	* <pre> -M
169	* treat missing values as a separate value.</pre>
170	*
171	<!-- options-end -->
172	*
173	* @param options the list of options as an array of strings
174	* @throws Exception if an option is not supported
175	**/
176	public void setOptions (String[] options)
177	throws Exception {
178	resetOptions();
179	setMissingMerge(!(Utils.getFlag('M', options)));
180	}
181
182	/**
183	* Returns the tip text for this property
184	* @return tip text for this property suitable for
185	* displaying in the explorer/experimenter gui
186	*/
187	public String missingMergeTipText() {
188	return "Distribute counts for missing values. Counts are distributed "
189	+"across other values in proportion to their frequency. Otherwise, "
190	+"missing is treated as a separate value.";
191	}
192
193	/**
194	* distribute the counts for missing values across observed values
195	*
196	* @param b true=distribute missing values.
197	*/
198	public void setMissingMerge (boolean b) {
199	m_missing_merge = b;
200	}
201
202
203	/**
204	* get whether missing values are being distributed or not
205	*
206	* @return true if missing values are being distributed.
207	*/
208	public boolean getMissingMerge () {
209	return m_missing_merge;
210	}
211
212
213	/**
214	* Gets the current settings of WrapperSubsetEval.
215	* @return an array of strings suitable for passing to setOptions()
216	*/
217	public String[] getOptions () {
218	String[] options = new String[1];
219	int current = 0;
220
221	if (!getMissingMerge()) {
222	options[current++] = "-M";
223	}
224
225	while (current < options.length) {
226	options[current++] = "";
227	}
228
229	return options;
230	}
231
232	/**
233	* Returns the capabilities of this evaluator.
234	*
235	* @return the capabilities of this evaluator
236	* @see Capabilities
237	*/
238	public Capabilities getCapabilities() {
239	Capabilities result = super.getCapabilities();
240	result.disableAll();
241
242	// attributes
243	result.enable(Capability.NOMINAL_ATTRIBUTES);
244	result.enable(Capability.NUMERIC_ATTRIBUTES);
245	result.enable(Capability.DATE_ATTRIBUTES);
246	result.enable(Capability.MISSING_VALUES);
247
248	// class
249	result.enable(Capability.NOMINAL_CLASS);
250	result.enable(Capability.MISSING_CLASS_VALUES);
251
252	return result;
253	}
254
255	/**
256	* Initializes the Significance attribute evaluator.
257	* Discretizes all attributes that are numeric.
258	*
259	* @param data set of instances serving as training data
260	* @throws Exception if the evaluator has not been
261	* generated successfully
262	*/
263	public void buildEvaluator (Instances data)
264	throws Exception {
265
266	// can evaluator handle data?
267	getCapabilities().testWithFail(data);
268
269	m_trainInstances = data;
270	m_classIndex = m_trainInstances.classIndex();
271	m_numAttribs = m_trainInstances.numAttributes();
272	m_numInstances = m_trainInstances.numInstances();
273	Discretize disTransform = new Discretize();
274	disTransform.setUseBetterEncoding(true);
275	disTransform.setInputFormat(m_trainInstances);
276	m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
277	m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
278	}
279
280
281	/**
282	* reset options to default values
283	*/
284	protected void resetOptions () {
285	m_trainInstances = null;
286	m_missing_merge = true;
287	}
288
289
290	/**
291	* evaluates an individual attribute by measuring the Significance
292	*
293	* @param attribute the index of the attribute to be evaluated
294	* @return the Significance of the attribute in the data base
295	* @throws Exception if the attribute could not be evaluated
296	*/
297	public double evaluateAttribute (int attribute)
298	throws Exception {
299	int i, j, ii, jj;
300	int ni, nj;
301	double sum = 0.0;
302	ni = m_trainInstances.attribute(attribute).numValues() + 1;
303	nj = m_numClasses + 1;
304	double[] sumi, sumj;
305	Instance inst;
306	double temp = 0.0;
307	sumi = new double[ni];
308	sumj = new double[nj];
309	double[][] counts = new double[ni][nj];
310
311	for (i = 0; i < ni; i++) {
312	sumi[i] = 0.0;
313
314	for (j = 0; j < nj; j++) {
315	sumj[j] = 0.0;
316	counts[i][j] = 0.0;
317	}
318	}
319
320	// Fill the contingency table
321	for (i = 0; i < m_numInstances; i++) {
322	inst = m_trainInstances.instance(i);
323
324	if (inst.isMissing(attribute)) {
325	ii = ni - 1;
326	}
327	else {
328	ii = (int)inst.value(attribute);
329	}
330
331	if (inst.isMissing(m_classIndex)) {
332	jj = nj - 1;
333	}
334	else {
335	jj = (int)inst.value(m_classIndex);
336	}
337
338	counts[ii][jj]++;
339	}
340
341	// get the row totals
342	for (i = 0; i < ni; i++) {
343	sumi[i] = 0.0;
344
345	for (j = 0; j < nj; j++) {
346	sumi[i] += counts[i][j];
347	sum += counts[i][j];
348	}
349	}
350
351	// get the column totals
352	for (j = 0; j < nj; j++) {
353	sumj[j] = 0.0;
354
355	for (i = 0; i < ni; i++) {
356	sumj[j] += counts[i][j];
357	}
358	}
359
360
361	// distribute missing counts
362	if (m_missing_merge &&
363	(sumi[ni-1] < m_numInstances) &&
364	(sumj[nj-1] < m_numInstances)) {
365	double[] i_copy = new double[sumi.length];
366	double[] j_copy = new double[sumj.length];
367	double[][] counts_copy = new double[sumi.length][sumj.length];
368
369	for (i = 0; i < ni; i++) {
370	System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
371	}
372
373	System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
374	System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
375	double total_missing = (sumi[ni - 1] + sumj[nj - 1] -
376	counts[ni - 1][nj - 1]);
377
378	// do the missing i's
379	if (sumi[ni - 1] > 0.0) {
380	for (j = 0; j < nj - 1; j++) {
381	if (counts[ni - 1][j] > 0.0) {
382	for (i = 0; i < ni - 1; i++) {
383	temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
384	counts[i][j] += temp;
385	sumi[i] += temp;
386	}
387
388	counts[ni - 1][j] = 0.0;
389	}
390	}
391	}
392
393	sumi[ni - 1] = 0.0;
394
395	// do the missing j's
396	if (sumj[nj - 1] > 0.0) {
397	for (i = 0; i < ni - 1; i++) {
398	if (counts[i][nj - 1] > 0.0) {
399	for (j = 0; j < nj - 1; j++) {
400	temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
401	counts[i][j] += temp;
402	sumj[j] += temp;
403	}
404
405	counts[i][nj - 1] = 0.0;
406	}
407	}
408	}
409
410	sumj[nj - 1] = 0.0;
411
412	// do the both missing
413	if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
414	for (i = 0; i < ni - 1; i++) {
415	for (j = 0; j < nj - 1; j++) {
416	temp = (counts_copy[i][j]/(sum - total_missing)) *
417	counts_copy[ni - 1][nj - 1];
418	counts[i][j] += temp;
419	sumi[i] += temp;
420	sumj[j] += temp;
421	}
422	}
423
424	counts[ni - 1][nj - 1] = 0.0;
425	}
426	}
427
428	/Working on the ContingencyTables**/
429	double discriminatingPower = associationAttributeClasses(counts);
430	double separability = associationClassesAttribute(counts);
431	/.../
432
433
434	return discriminatingPower + separability / 2;
435	}
436
437	/**
438	* evaluates an individual attribute by measuring the attribute-classes
439	* association
440	*
441	* @param counts the Contingency table where are the frecuency counts values
442	* @return the discriminating power of the attribute
443	*/
444	public double associationAttributeClasses(double[][] counts){
445
446	List<Integer> supportSet = new ArrayList<Integer>();
447	List<Integer> not_supportSet = new ArrayList<Integer>();
448
449	double discriminatingPower = 0;
450
451
452	int numValues = counts.length;
453	int numClasses = counts[0].length;
454
455	int total = 0;
456
457	double[] sumRows = new double[numValues];
458	double[] sumCols = new double[numClasses];
459
460	// get the row totals
461	for (int i = 0; i < numValues; i++) {
462	sumRows[i] = 0.0;
463
464	for (int j = 0; j < numClasses; j++) {
465	sumRows[i] += counts[i][j];
466	total += counts[i][j];
467	}
468	}
469
470	// get the column totals
471	for (int j = 0; j < numClasses; j++) {
472	sumCols[j] = 0.0;
473
474	for (int i = 0; i < numValues; i++) {
475	sumCols[j] += counts[i][j];
476	}
477	}
478
479	for (int i = 0; i < numClasses; i++) {
480	for (int j = 0; j < numValues; j++) {
481
482	//Computing Conditional Probability P(Clasei \| Valuej)
483	double numerator1 = counts[j][i];
484	double denominator1 = sumRows[j];
485	double result1;
486
487	if(denominator1 != 0)
488	result1 = numerator1/denominator1;
489	else
490	result1 = 0;
491
492	//Computing Conditional Probability P(Clasei \| ^Valuej)
493	double numerator2 = sumCols[i] - counts[j][i];
494	double denominator2 = total - sumRows[j];
495	double result2;
496
497	if(denominator2 != 0)
498	result2 = numerator2/denominator2;
499	else
500	result2 = 0;
501
502
503	if(result1 > result2){
504	supportSet.add (i);
505	discriminatingPower +=result1;
506	}
507	else{
508	not_supportSet.add (i);
509	discriminatingPower +=result2;
510	}
511	}
512
513	}
514
515	return discriminatingPower/numValues - 1.0;
516	}
517
518	/**
519	* evaluates an individual attribute by measuring the classes-attribute
520	* association
521	*
522	* @param counts the Contingency table where are the frecuency counts values
523	* @return the separability power of the classes
524	*/
525	public double associationClassesAttribute(double[][] counts){
526
527	List<Integer> supportSet = new ArrayList<Integer>();
528	List<Integer> not_supportSet = new ArrayList<Integer>();
529
530	double separability = 0;
531
532
533	int numValues = counts.length;
534	int numClasses = counts[0].length;
535
536	int total = 0;
537
538	double[] sumRows = new double[numValues];
539	double[] sumCols = new double[numClasses];
540
541	// get the row totals
542	for (int i = 0; i < numValues; i++) {
543	sumRows[i] = 0.0;
544
545	for (int j = 0; j < numClasses; j++) {
546	sumRows[i] += counts[i][j];
547	total += counts[i][j];
548	}
549	}
550
551	// get the column totals
552	for (int j = 0; j < numClasses; j++) {
553	sumCols[j] = 0.0;
554
555	for (int i = 0; i < numValues; i++) {
556	sumCols[j] += counts[i][j];
557	}
558	}
559
560	for (int i = 0; i < numValues; i++) {
561	for (int j = 0; j < numClasses; j++) {
562
563	//Computing Conditional Probability P(Valuei \| Clasej)
564	double numerator1 = counts[i][j];
565	double denominator1 = sumCols[j];
566	double result1;
567
568	if(denominator1 != 0)
569	result1 = numerator1/denominator1;
570	else
571	result1 = 0;
572
573	//Computing Conditional Probability P(Valuei \| ^Clasej)
574	double numerator2 = sumRows[i] - counts[i][j];
575	double denominator2 = total - sumCols[j];
576	double result2;
577
578	if(denominator2 != 0)
579	result2 = numerator2/denominator2;
580	else
581	result2 = 0;
582
583
584	if(result1 > result2){
585	supportSet.add (i);
586	separability +=result1;
587	}
588	else{
589	not_supportSet.add (i);
590	separability +=result2;
591	}
592	}
593
594	}
595
596	return separability/numClasses - 1.0;
597	}
598
599
600	/**
601	* Return a description of the evaluator
602	* @return description as a string
603	*/
604	public String toString () {
605	StringBuffer text = new StringBuffer();
606
607	if (m_trainInstances == null) {
608	text.append("\tSignificance evaluator has not been built");
609	}
610	else {
611	text.append("\tSignificance feature evaluator");
612
613	if (!m_missing_merge) {
614	text.append("\n\tMissing values treated as seperate");
615	}
616	}
617
618	text.append("\n");
619	return text.toString();
620	}
621
622	/**
623	* Returns the revision string.
624	*
625	* @return the revision
626	*/
627	public String getRevision() {
628	return RevisionUtils.extract("$Revision: 5447 $");
629	}
630
631	/**
632	* Main method for testing this class.
633	*
634	* @param args the options
635	*/
636	public static void main (String[] args) {
637	runEvaluator(new SignificanceAttributeEval(), args);
638	}
639	}
640

Note: See TracBrowser for help on using the repository browser.

Download in other formats: