Context Navigation

source: src/main/java/weka/attributeSelection/SignificanceAttributeEval.java @ 4

Last change on this file since 4 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 16.4 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* SignificanceAttributeEval.java
	19	* Copyright (C) 2009 Adrian Pino
	20	* Copyright (C) 2009 University of Waikato, Hamilton, NZ
	21	*
	22	*/
	23	package weka.attributeSelection;
	24
	25	import java.util.ArrayList;
	26	import java.util.Enumeration;
	27	import java.util.List;
	28	import java.util.Vector;
	29
	30	import weka.core.Capabilities;
	31	import weka.core.Instance;
	32	import weka.core.Instances;
	33	import weka.core.Option;
	34	import weka.core.OptionHandler;
	35	import weka.core.RevisionUtils;
	36	import weka.core.TechnicalInformation;
	37	import weka.core.TechnicalInformationHandler;
	38	import weka.core.Utils;
	39	import weka.core.Capabilities.Capability;
	40	import weka.core.TechnicalInformation.Field;
	41	import weka.core.TechnicalInformation.Type;
	42	import weka.filters.Filter;
	43	import weka.filters.supervised.attribute.Discretize;
	44
	45	/**
	46	<!-- globalinfo-start -->
	47	* Significance :<br/>
	48	* <br/>
	49	* Evaluates the worth of an attribute by computing the Probabilistic Significance as a two-way function.<br/>
	50	* (attribute-classes and classes-attribute association)<br/>
	51	* <br/>
	52	* For more information see:<br/>
	53	* <br/>
	54	* Amir Ahmad, Lipika Dey (2004). A feature selection technique for classificatory analysis.
	55	* <p/>
	56	<!-- globalinfo-end -->
	57	*
	58	<!-- options-start -->
	59	* Valid options are: <p/>
	60	*
	61	* <pre> -M
	62	* treat missing values as a separate value.</pre>
	63	*
	64	<!-- options-end -->
	65	*
	66	<!-- technical-bibtex-start -->
	67	* BibTeX:
	68	* <pre>
	69	* @phdthesis{Ahmad2004,
	70	* author = {Amir Ahmad and Lipika Dey},
	71	* month = {October},
	72	* publisher = {ELSEVIER},
	73	* title = {A feature selection technique for classificatory analysis},
	74	* year = {2004}
	75	* }
	76	* </pre>
	77	* <p/>
	78	<!-- technical-bibtex-end -->
	79	*
	80	* @author Adrian Pino (apinoa@facinf.uho.edu.cu)
	81	* @version $Revision: 5447 $
	82	*/
	83	public class SignificanceAttributeEval
	84	extends ASEvaluation
	85	implements AttributeEvaluator, OptionHandler, TechnicalInformationHandler {
	86
	87	/** Version identifier used for serialization of this evaluator. */
	88	static final long serialVersionUID = -8504656625598579926L;
	89
	90	/** The training instances; null until buildEvaluator() has run, afterwards the discretized data. */
	91	private Instances m_trainInstances;
	92
	93	/** The class index, as reported by the training data passed to buildEvaluator(). */
	94	private int m_classIndex;
	95
	96	/** The number of attributes in the training data passed to buildEvaluator(). */
	97	private int m_numAttribs;
	98
	99	/** The number of instances in the training data passed to buildEvaluator(). */
	100	private int m_numInstances;
	101
	102	/** The number of classes, i.e. the number of values of the class attribute. */
	103	private int m_numClasses;
	104
	105	/** Merge missing values: when true (the default) missing counts are distributed across observed values. */
	106	private boolean m_missing_merge;
	107
	108	/**
	109	* Returns a string describing this attribute evaluator
	110	* @return a description of the evaluator suitable for
	111	* displaying in the explorer/experimenter gui
	112	*/
	113	public String globalInfo() {
	114	return "Significance :\n\nEvaluates the worth of an attribute "
	115	+"by computing the Probabilistic Significance as a two-way function.\n"
	116	+"(atributte-classes and classes-atribute association)\n\n"
	117	+ "For more information see:\n\n"
	118	+ getTechnicalInformation().toString();
	119	}
	120
	121	/**
	122	* Returns an instance of a TechnicalInformation object, containing
	123	* detailed information about the technical background of this class,
	124	* e.g., paper reference or book this class is based on.
	125	*
	126	* @return the technical information about this class
	127	*/
	128	public TechnicalInformation getTechnicalInformation() {
	129	TechnicalInformation result;
	130
	131	result = new TechnicalInformation(Type.PHDTHESIS);
	132	result.setValue(Field.AUTHOR, "Amir Ahmad and Lipika Dey");
	133	result.setValue(Field.YEAR, "2004");
	134	result.setValue(Field.MONTH, "October");
	135	result.setValue(Field.TITLE, "A feature selection technique for classificatory analysis");
	136	result.setValue(Field.PUBLISHER, "ELSEVIER");
	137
	138	return result;
	139	}
	140
	141
	/**
	 * Creates an evaluator with all options at their default values
	 * (see {@link #resetOptions()}).
	 */
	public SignificanceAttributeEval () {
		this.resetOptions();
	}
	148
	149
	150	/**
	151	* Returns an enumeration describing the available options.
	152	* @return an enumeration of all the available options.
	153	**/
	154	public Enumeration listOptions () {
	155	Vector newVector = new Vector(1);
	156	newVector.addElement(new Option("\ttreat missing values as a separate "
	157	+ "value.", "M", 0, "-M"));
	158	return newVector.elements();
	159	}
	160
	161
	162	/**
	163	* Parses a given list of options. <p/>
	164	*
	165	<!-- options-start -->
	166	* Valid options are: <p/>
	167	*
	168	* <pre> -M
	169	* treat missing values as a separate value.</pre>
	170	*
	171	<!-- options-end -->
	172	*
	173	* @param options the list of options as an array of strings
	174	* @throws Exception if an option is not supported
	175	**/
	176	public void setOptions (String[] options)
	177	throws Exception {
	178	resetOptions();
	179	setMissingMerge(!(Utils.getFlag('M', options)));
	180	}
	181
	182	/**
	183	* Returns the tip text for this property
	184	* @return tip text for this property suitable for
	185	* displaying in the explorer/experimenter gui
	186	*/
	187	public String missingMergeTipText() {
	188	return "Distribute counts for missing values. Counts are distributed "
	189	+"across other values in proportion to their frequency. Otherwise, "
	190	+"missing is treated as a separate value.";
	191	}
	192
	193	/**
	194	* distribute the counts for missing values across observed values
	195	*
	196	* @param b true=distribute missing values.
	197	*/
	198	public void setMissingMerge (boolean b) {
	199	m_missing_merge = b;
	200	}
	201
	202
	203	/**
	204	* get whether missing values are being distributed or not
	205	*
	206	* @return true if missing values are being distributed.
	207	*/
	208	public boolean getMissingMerge () {
	209	return m_missing_merge;
	210	}
	211
	212
	213	/**
	214	* Gets the current settings of WrapperSubsetEval.
	215	* @return an array of strings suitable for passing to setOptions()
	216	*/
	217	public String[] getOptions () {
	218	String[] options = new String[1];
	219	int current = 0;
	220
	221	if (!getMissingMerge()) {
	222	options[current++] = "-M";
	223	}
	224
	225	while (current < options.length) {
	226	options[current++] = "";
	227	}
	228
	229	return options;
	230	}
	231
	232	/**
	233	* Returns the capabilities of this evaluator.
	234	*
	235	* @return the capabilities of this evaluator
	236	* @see Capabilities
	237	*/
	238	public Capabilities getCapabilities() {
	239	Capabilities result = super.getCapabilities();
	240	result.disableAll();
	241
	242	// attributes
	243	result.enable(Capability.NOMINAL_ATTRIBUTES);
	244	result.enable(Capability.NUMERIC_ATTRIBUTES);
	245	result.enable(Capability.DATE_ATTRIBUTES);
	246	result.enable(Capability.MISSING_VALUES);
	247
	248	// class
	249	result.enable(Capability.NOMINAL_CLASS);
	250	result.enable(Capability.MISSING_CLASS_VALUES);
	251
	252	return result;
	253	}
	254
	255	/**
	256	* Initializes the Significance attribute evaluator.
	257	* Discretizes all attributes that are numeric.
	258	*
	259	* @param data set of instances serving as training data
	260	* @throws Exception if the evaluator has not been
	261	* generated successfully
	262	*/
	263	public void buildEvaluator (Instances data)
	264	throws Exception {
	265
	266	// can evaluator handle data?
	267	getCapabilities().testWithFail(data);
	268
	269	m_trainInstances = data;
	270	m_classIndex = m_trainInstances.classIndex();
	271	m_numAttribs = m_trainInstances.numAttributes();
	272	m_numInstances = m_trainInstances.numInstances();
	273	Discretize disTransform = new Discretize();
	274	disTransform.setUseBetterEncoding(true);
	275	disTransform.setInputFormat(m_trainInstances);
	276	m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
	277	m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
	278	}
	279
	280
	281	/**
	282	* reset options to default values
	283	*/
	284	protected void resetOptions () {
	285	m_trainInstances = null;
	286	m_missing_merge = true;
	287	}
	288
	289
	290	/**
	291	* evaluates an individual attribute by measuring the Significance
	292	*
	293	* @param attribute the index of the attribute to be evaluated
	294	* @return the Significance of the attribute in the data base
	295	* @throws Exception if the attribute could not be evaluated
	296	*/
	297	public double evaluateAttribute (int attribute)
	298	throws Exception {
	299	int i, j, ii, jj;
	300	int ni, nj;
	301	double sum = 0.0;
	302	ni = m_trainInstances.attribute(attribute).numValues() + 1;
	303	nj = m_numClasses + 1;
	304	double[] sumi, sumj;
	305	Instance inst;
	306	double temp = 0.0;
	307	sumi = new double[ni];
	308	sumj = new double[nj];
	309	double[][] counts = new double[ni][nj];
	310
	311	for (i = 0; i < ni; i++) {
	312	sumi[i] = 0.0;
	313
	314	for (j = 0; j < nj; j++) {
	315	sumj[j] = 0.0;
	316	counts[i][j] = 0.0;
	317	}
	318	}
	319
	320	// Fill the contingency table
	321	for (i = 0; i < m_numInstances; i++) {
	322	inst = m_trainInstances.instance(i);
	323
	324	if (inst.isMissing(attribute)) {
	325	ii = ni - 1;
	326	}
	327	else {
	328	ii = (int)inst.value(attribute);
	329	}
	330
	331	if (inst.isMissing(m_classIndex)) {
	332	jj = nj - 1;
	333	}
	334	else {
	335	jj = (int)inst.value(m_classIndex);
	336	}
	337
	338	counts[ii][jj]++;
	339	}
	340
	341	// get the row totals
	342	for (i = 0; i < ni; i++) {
	343	sumi[i] = 0.0;
	344
	345	for (j = 0; j < nj; j++) {
	346	sumi[i] += counts[i][j];
	347	sum += counts[i][j];
	348	}
	349	}
	350
	351	// get the column totals
	352	for (j = 0; j < nj; j++) {
	353	sumj[j] = 0.0;
	354
	355	for (i = 0; i < ni; i++) {
	356	sumj[j] += counts[i][j];
	357	}
	358	}
	359
	360
	361	// distribute missing counts
	362	if (m_missing_merge &&
	363	(sumi[ni-1] < m_numInstances) &&
	364	(sumj[nj-1] < m_numInstances)) {
	365	double[] i_copy = new double[sumi.length];
	366	double[] j_copy = new double[sumj.length];
	367	double[][] counts_copy = new double[sumi.length][sumj.length];
	368
	369	for (i = 0; i < ni; i++) {
	370	System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
	371	}
	372
	373	System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
	374	System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
	375	double total_missing = (sumi[ni - 1] + sumj[nj - 1] -
	376	counts[ni - 1][nj - 1]);
	377
	378	// do the missing i's
	379	if (sumi[ni - 1] > 0.0) {
	380	for (j = 0; j < nj - 1; j++) {
	381	if (counts[ni - 1][j] > 0.0) {
	382	for (i = 0; i < ni - 1; i++) {
	383	temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
	384	counts[i][j] += temp;
	385	sumi[i] += temp;
	386	}
	387
	388	counts[ni - 1][j] = 0.0;
	389	}
	390	}
	391	}
	392
	393	sumi[ni - 1] = 0.0;
	394
	395	// do the missing j's
	396	if (sumj[nj - 1] > 0.0) {
	397	for (i = 0; i < ni - 1; i++) {
	398	if (counts[i][nj - 1] > 0.0) {
	399	for (j = 0; j < nj - 1; j++) {
	400	temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
	401	counts[i][j] += temp;
	402	sumj[j] += temp;
	403	}
	404
	405	counts[i][nj - 1] = 0.0;
	406	}
	407	}
	408	}
	409
	410	sumj[nj - 1] = 0.0;
	411
	412	// do the both missing
	413	if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
	414	for (i = 0; i < ni - 1; i++) {
	415	for (j = 0; j < nj - 1; j++) {
	416	temp = (counts_copy[i][j]/(sum - total_missing)) *
	417	counts_copy[ni - 1][nj - 1];
	418	counts[i][j] += temp;
	419	sumi[i] += temp;
	420	sumj[j] += temp;
	421	}
	422	}
	423
	424	counts[ni - 1][nj - 1] = 0.0;
	425	}
	426	}
	427
	428	/Working on the ContingencyTables**/
	429	double discriminatingPower = associationAttributeClasses(counts);
	430	double separability = associationClassesAttribute(counts);
	431	/.../
	432
	433
	434	return discriminatingPower + separability / 2;
	435	}
	436
	437	/**
	438	* evaluates an individual attribute by measuring the attribute-classes
	439	* association
	440	*
	441	* @param counts the Contingency table where are the frecuency counts values
	442	* @return the discriminating power of the attribute
	443	*/
	444	public double associationAttributeClasses(double[][] counts){
	445
	446	List<Integer> supportSet = new ArrayList<Integer>();
	447	List<Integer> not_supportSet = new ArrayList<Integer>();
	448
	449	double discriminatingPower = 0;
	450
	451
	452	int numValues = counts.length;
	453	int numClasses = counts[0].length;
	454
	455	int total = 0;
	456
	457	double[] sumRows = new double[numValues];
	458	double[] sumCols = new double[numClasses];
	459
	460	// get the row totals
	461	for (int i = 0; i < numValues; i++) {
	462	sumRows[i] = 0.0;
	463
	464	for (int j = 0; j < numClasses; j++) {
	465	sumRows[i] += counts[i][j];
	466	total += counts[i][j];
	467	}
	468	}
	469
	470	// get the column totals
	471	for (int j = 0; j < numClasses; j++) {
	472	sumCols[j] = 0.0;
	473
	474	for (int i = 0; i < numValues; i++) {
	475	sumCols[j] += counts[i][j];
	476	}
	477	}
	478
	479	for (int i = 0; i < numClasses; i++) {
	480	for (int j = 0; j < numValues; j++) {
	481
	482	//Computing Conditional Probability P(Clasei \| Valuej)
	483	double numerator1 = counts[j][i];
	484	double denominator1 = sumRows[j];
	485	double result1;
	486
	487	if(denominator1 != 0)
	488	result1 = numerator1/denominator1;
	489	else
	490	result1 = 0;
	491
	492	//Computing Conditional Probability P(Clasei \| ^Valuej)
	493	double numerator2 = sumCols[i] - counts[j][i];
	494	double denominator2 = total - sumRows[j];
	495	double result2;
	496
	497	if(denominator2 != 0)
	498	result2 = numerator2/denominator2;
	499	else
	500	result2 = 0;
	501
	502
	503	if(result1 > result2){
	504	supportSet.add (i);
	505	discriminatingPower +=result1;
	506	}
	507	else{
	508	not_supportSet.add (i);
	509	discriminatingPower +=result2;
	510	}
	511	}
	512
	513	}
	514
	515	return discriminatingPower/numValues - 1.0;
	516	}
	517
	518	/**
	519	* evaluates an individual attribute by measuring the classes-attribute
	520	* association
	521	*
	522	* @param counts the Contingency table where are the frecuency counts values
	523	* @return the separability power of the classes
	524	*/
	525	public double associationClassesAttribute(double[][] counts){
	526
	527	List<Integer> supportSet = new ArrayList<Integer>();
	528	List<Integer> not_supportSet = new ArrayList<Integer>();
	529
	530	double separability = 0;
	531
	532
	533	int numValues = counts.length;
	534	int numClasses = counts[0].length;
	535
	536	int total = 0;
	537
	538	double[] sumRows = new double[numValues];
	539	double[] sumCols = new double[numClasses];
	540
	541	// get the row totals
	542	for (int i = 0; i < numValues; i++) {
	543	sumRows[i] = 0.0;
	544
	545	for (int j = 0; j < numClasses; j++) {
	546	sumRows[i] += counts[i][j];
	547	total += counts[i][j];
	548	}
	549	}
	550
	551	// get the column totals
	552	for (int j = 0; j < numClasses; j++) {
	553	sumCols[j] = 0.0;
	554
	555	for (int i = 0; i < numValues; i++) {
	556	sumCols[j] += counts[i][j];
	557	}
	558	}
	559
	560	for (int i = 0; i < numValues; i++) {
	561	for (int j = 0; j < numClasses; j++) {
	562
	563	//Computing Conditional Probability P(Valuei \| Clasej)
	564	double numerator1 = counts[i][j];
	565	double denominator1 = sumCols[j];
	566	double result1;
	567
	568	if(denominator1 != 0)
	569	result1 = numerator1/denominator1;
	570	else
	571	result1 = 0;
	572
	573	//Computing Conditional Probability P(Valuei \| ^Clasej)
	574	double numerator2 = sumRows[i] - counts[i][j];
	575	double denominator2 = total - sumCols[j];
	576	double result2;
	577
	578	if(denominator2 != 0)
	579	result2 = numerator2/denominator2;
	580	else
	581	result2 = 0;
	582
	583
	584	if(result1 > result2){
	585	supportSet.add (i);
	586	separability +=result1;
	587	}
	588	else{
	589	not_supportSet.add (i);
	590	separability +=result2;
	591	}
	592	}
	593
	594	}
	595
	596	return separability/numClasses - 1.0;
	597	}
	598
	599
	600	/**
	601	* Return a description of the evaluator
	602	* @return description as a string
	603	*/
	604	public String toString () {
	605	StringBuffer text = new StringBuffer();
	606
	607	if (m_trainInstances == null) {
	608	text.append("\tSignificance evaluator has not been built");
	609	}
	610	else {
	611	text.append("\tSignificance feature evaluator");
	612
	613	if (!m_missing_merge) {
	614	text.append("\n\tMissing values treated as seperate");
	615	}
	616	}
	617
	618	text.append("\n");
	619	return text.toString();
	620	}
	621
	622	/**
	623	* Returns the revision string.
	624	*
	625	* @return the revision
	626	*/
	627	public String getRevision() {
	628	return RevisionUtils.extract("$Revision: 5447 $");
	629	}
	630
	631	/**
	632	* Main method for testing this class.
	633	*
	634	* @param args the options
	635	*/
	636	public static void main (String[] args) {
	637	runEvaluator(new SignificanceAttributeEval(), args);
	638	}
	639	}
	640

Note: See TracBrowser for help on using the repository browser.

Download in other formats: