/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * CfsSubsetEval.java
 * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.attributeSelection;

import weka.core.Capabilities;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * CfsSubsetEval :<br/>
 * <br/>
 * Evaluates the worth of a subset of attributes by considering the individual predictive ability of each feature along with the degree of redundancy between them.<br/>
 * <br/>
 * Subsets of features that are highly correlated with the class while having low intercorrelation are preferred.<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * M. A. Hall (1998). Correlation-based Feature Subset Selection for Machine Learning. Hamilton, New Zealand.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * @phdthesis{Hall1998,
 *    address = {Hamilton, New Zealand},
 *    author = {M. A. Hall},
 *    school = {University of Waikato},
 *    title = {Correlation-based Feature Subset Selection for Machine Learning},
 *    year = {1998}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -M
 *  Treat missing values as a separate value.</pre>
 *
 * <pre> -L
 *  Don't include locally predictive attributes.</pre>
 *
 <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 6132 $
 * @see Discretize
 */
public class CfsSubsetEval
  extends ASEvaluation
  implements SubsetEvaluator,
             OptionHandler,
             TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = 747878400813276317L;

  /** The training instances */
  private Instances m_trainInstances;
  /** Discretise attributes when class is nominal */
  private Discretize m_disTransform;
  /** The class index */
  private int m_classIndex;
  /** Is the class numeric */
  private boolean m_isNumeric;
  /** Number of attributes in the training data */
  private int m_numAttribs;
  /** Number of instances in the training data */
  private int m_numInstances;
  /** Treat missing values as separate values */
  private boolean m_missingSeparate;
  /** Include locally predictive attributes */
  private boolean m_locallyPredictive;
  /** Holds the matrix of attribute correlations */
  // private Matrix m_corr_matrix;
  private float [][] m_corr_matrix;
  /** Standard deviations of attributes (when using Pearson's correlation) */
  private double[] m_std_devs;
  /** Threshold for admitting locally predictive features */
  private double m_c_Threshold;

  /**
   * Returns a string describing this attribute evaluator
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
      +"by considering the individual predictive ability of each feature "
      +"along with the degree of redundancy between them.\n\n"
      +"Subsets of features that are highly correlated with the class "
      +"while having low intercorrelation are preferred.\n\n"
      + "For more information see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.PHDTHESIS);
    result.setValue(Field.AUTHOR, "M. A. Hall");
    result.setValue(Field.YEAR, "1998");
    result.setValue(Field.TITLE, "Correlation-based Feature Subset Selection for Machine Learning");
    result.setValue(Field.SCHOOL, "University of Waikato");
    result.setValue(Field.ADDRESS, "Hamilton, New Zealand");

    return result;
  }

  /**
   * Constructor
   */
  public CfsSubsetEval () {
    resetOptions();
  }


  /**
   * Returns an enumeration describing the available options.
   * @return an enumeration of all the available options.
   *
   **/
  public Enumeration listOptions () {
    Vector newVector = new Vector(3);
    newVector.addElement(new Option("\tTreat missing values as a separate "
                                    + "value.", "M", 0, "-M"));
    newVector.addElement(new Option("\tDon't include locally predictive attributes"
                                    + ".", "L", 0, "-L"));
    return newVector.elements();
  }


  /**
   * Parses and sets a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -M
   *  Treat missing values as a separate value.</pre>
   *
   * <pre> -L
   *  Don't include locally predictive attributes.</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   *
   **/
  public void setOptions (String[] options)
    throws Exception {

    resetOptions();
    setMissingSeparate(Utils.getFlag('M', options));
    setLocallyPredictive(!Utils.getFlag('L', options));
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String locallyPredictiveTipText() {
    return "Identify locally predictive attributes. Iteratively adds "
      +"attributes with the highest correlation with the class as long "
      +"as there is not already an attribute in the subset that has a "
      +"higher correlation with the attribute in question.";
  }

  /**
   * Include locally predictive attributes
   *
   * @param b true or false
   */
  public void setLocallyPredictive (boolean b) {
    m_locallyPredictive = b;
  }


  /**
   * Return true if including locally predictive attributes
   *
   * @return true if locally predictive attributes are to be used
   */
  public boolean getLocallyPredictive () {
    return m_locallyPredictive;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String missingSeparateTipText() {
    return "Treat missing as a separate value. Otherwise, counts for missing "
      +"values are distributed across other values in proportion to their "
      +"frequency.";
  }

  /**
   * Treat missing as a separate value
   *
   * @param b true or false
   */
  public void setMissingSeparate (boolean b) {
    m_missingSeparate = b;
  }


  /**
   * Return true if missing is treated as a separate value
   *
   * @return true if missing is to be treated as a separate value
   */
  public boolean getMissingSeparate () {
    return m_missingSeparate;
  }


  /**
   * Gets the current settings of CfsSubsetEval
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    String[] options = new String[2];
    int current = 0;

    if (getMissingSeparate()) {
      options[current++] = "-M";
    }

    if (!getLocallyPredictive()) {
      options[current++] = "-L";
    }

    while (current < options.length) {
      options[current++] = "";
    }

    return options;
  }

  /**
   * Returns the capabilities of this evaluator.
   *
   * @return the capabilities of this evaluator
   * @see Capabilities
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.NUMERIC_CLASS);
    result.enable(Capability.DATE_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    return result;
  }

  /**
   * Generates an attribute evaluator. Has to initialize all fields of the
   * evaluator that are not being set via options.
   *
   * CFS also discretises attributes (if necessary) and initializes
   * the correlation matrix.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the evaluator has not been
   * generated successfully
   */
  public void buildEvaluator (Instances data)
    throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    m_trainInstances = new Instances(data);
    m_trainInstances.deleteWithMissingClass();
    m_classIndex = m_trainInstances.classIndex();
    m_numAttribs = m_trainInstances.numAttributes();
    m_numInstances = m_trainInstances.numInstances();
    m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();

    if (!m_isNumeric) {
      m_disTransform = new Discretize();
      m_disTransform.setUseBetterEncoding(true);
      m_disTransform.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
    }

    m_std_devs = new double[m_numAttribs];
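    // the correlation matrix uses lower-triangular (jagged) storage:
    // row i holds the correlations of attribute i with attributes 0..i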
    m_corr_matrix = new float [m_numAttribs][];
    for (int i = 0; i < m_numAttribs; i++) {
      m_corr_matrix[i] = new float [i+1];
    }

    for (int i = 0; i < m_corr_matrix.length; i++) {
      m_corr_matrix[i][i] = 1.0f;
      m_std_devs[i] = 1.0;
    }

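    // -999 marks a correlation that has not been computed yet; entries are
    // filled lazily by evaluateSubset() and addLocallyPredictive()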
    for (int i = 0; i < m_numAttribs; i++) {
      for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
        m_corr_matrix[i][j] = -999;
      }
    }
  }


  /**
   * evaluates a subset of attributes
   *
   * @param subset a bitset representing the attribute subset to be
   * evaluated
   * @return the merit
   * @throws Exception if the subset could not be evaluated
   */
  public double evaluateSubset (BitSet subset)
    throws Exception {
    double num = 0.0;
    double denom = 0.0;
    float corr;
    int larger, smaller;
    // do numerator
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (subset.get(i)) {
          if (i > m_classIndex) {
            larger = i; smaller = m_classIndex;
          } else {
            smaller = i; larger = m_classIndex;
          }
          /* int larger = (i > m_classIndex ? i : m_classIndex);
             int smaller = (i > m_classIndex ? m_classIndex : i); */
          if (m_corr_matrix[larger][smaller] == -999) {
            corr = correlate(i, m_classIndex);
            m_corr_matrix[larger][smaller] = corr;
            num += (m_std_devs[i] * corr);
          }
          else {
            num += (m_std_devs[i] * m_corr_matrix[larger][smaller]);
          }
        }
      }
    }

    // do denominator
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (subset.get(i)) {
          denom += (1.0 * m_std_devs[i] * m_std_devs[i]);

          for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
            if (subset.get(j)) {
              if (m_corr_matrix[i][j] == -999) {
                corr = correlate(i, j);
                m_corr_matrix[i][j] = corr;
                denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
              }
              else {
                denom += (2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix[i][j]);
              }
            }
          }
        }
      }
    }

    if (denom < 0.0) {
      denom *= -1.0;
    }

    if (denom == 0.0) {
      return (0.0);
    }

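    // CFS merit (Hall, 1998): merit = k * mean(r_cf) / sqrt(k + k(k-1) * mean(r_ff)),
    // where k is the subset size, r_cf the feature-class correlations and
    // r_ff the feature-feature intercorrelations. num holds the (std-dev
    // weighted) numerator and denom the expression under the square root.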
    double merit = (num/Math.sqrt(denom));

    if (merit < 0.0) {
      merit *= -1.0;
    }

    return merit;
  }


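  /**
   * Computes the correlation between two attributes. For a nominal class the
   * symmetrical uncertainty on the discretised data is used; for a numeric
   * class the appropriate numeric/nominal correlation measure is chosen
   * according to the types of the two attributes.
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   * @return the correlation between the two attributes
   */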
  private float correlate (int att1, int att2) {
    if (!m_isNumeric) {
      return (float) symmUncertCorr(att1, att2);
    }

    boolean att1_is_num = (m_trainInstances.attribute(att1).isNumeric());
    boolean att2_is_num = (m_trainInstances.attribute(att2).isNumeric());

    if (att1_is_num && att2_is_num) {
      return (float) num_num(att1, att2);
    }
    else if (att2_is_num) {
      return (float) num_nom2(att1, att2);
    }
    else if (att1_is_num) {
      return (float) num_nom2(att2, att1);
    }

    return (float) nom_nom(att1, att2);
  }


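  /**
   * Computes the symmetrical uncertainty between two (discretised) attributes,
   * either treating missing values as a separate value or distributing their
   * counts across the other values in proportion to their frequencies.
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   * @return the symmetrical uncertainty between the two attributes
   */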
  private double symmUncertCorr (int att1, int att2) {
    int i, j, k, ii, jj;
    int ni, nj;
    double sum = 0.0;
    double sumi[], sumj[];
    double counts[][];
    Instance inst;
    double corr_measure;
    boolean flag = false;
    double temp = 0.0;

    if (att1 == m_classIndex || att2 == m_classIndex) {
      flag = true;
    }

    ni = m_trainInstances.attribute(att1).numValues() + 1;
    nj = m_trainInstances.attribute(att2).numValues() + 1;
    counts = new double[ni][nj];
    sumi = new double[ni];
    sumj = new double[nj];

    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumj[j] = 0.0;
        counts[i][j] = 0.0;
      }
    }

    // Fill the contingency table
    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);

      if (inst.isMissing(att1)) {
        ii = ni - 1;
      }
      else {
        ii = (int)inst.value(att1);
      }

      if (inst.isMissing(att2)) {
        jj = nj - 1;
      }
      else {
        jj = (int)inst.value(att2);
      }

      counts[ii][jj]++;
    }

    // get the row totals
    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumi[i] += counts[i][j];
        sum += counts[i][j];
      }
    }

    // get the column totals
    for (j = 0; j < nj; j++) {
      sumj[j] = 0.0;

      for (i = 0; i < ni; i++) {
        sumj[j] += counts[i][j];
      }
    }

    // distribute missing counts
    if (!m_missingSeparate &&
        (sumi[ni-1] < m_numInstances) &&
        (sumj[nj-1] < m_numInstances)) {
      double[] i_copy = new double[sumi.length];
      double[] j_copy = new double[sumj.length];
      double[][] counts_copy = new double[sumi.length][sumj.length];

      for (i = 0; i < ni; i++) {
        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
      }

      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
      double total_missing =
        (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);

      // do the missing i's
      if (sumi[ni - 1] > 0.0) {
        for (j = 0; j < nj - 1; j++) {
          if (counts[ni - 1][j] > 0.0) {
            for (i = 0; i < ni - 1; i++) {
              temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
              counts[i][j] += temp;
              sumi[i] += temp;
            }

            counts[ni - 1][j] = 0.0;
          }
        }
      }

      sumi[ni - 1] = 0.0;

      // do the missing j's
      if (sumj[nj - 1] > 0.0) {
        for (i = 0; i < ni - 1; i++) {
          if (counts[i][nj - 1] > 0.0) {
            for (j = 0; j < nj - 1; j++) {
              temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
              counts[i][j] += temp;
              sumj[j] += temp;
            }

            counts[i][nj - 1] = 0.0;
          }
        }
      }

      sumj[nj - 1] = 0.0;

      // do the both missing
      if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
        for (i = 0; i < ni - 1; i++) {
          for (j = 0; j < nj - 1; j++) {
            temp = (counts_copy[i][j]/(sum - total_missing)) *
              counts_copy[ni - 1][nj - 1];

            counts[i][j] += temp;
            sumi[i] += temp;
            sumj[j] += temp;
          }
        }

        counts[ni - 1][nj - 1] = 0.0;
      }
    }

    corr_measure = ContingencyTables.symmetricalUncertainty(counts);

    if (Utils.eq(corr_measure, 0.0)) {
      if (flag == true) {
        return (0.0);
      }
      else {
        return (1.0);
      }
    }
    else {
      return (corr_measure);
    }
  }


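  /**
   * Computes the absolute Pearson correlation between two numeric attributes
   * (used when the class is numeric). Also records each attribute's standard
   * deviation in m_std_devs the first time it is seen.
   *
   * @param att1 index of the first numeric attribute
   * @param att2 index of the second numeric attribute
   * @return the absolute correlation between the two attributes
   */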
  private double num_num (int att1, int att2) {
    int i;
    Instance inst;
    double r, diff1, diff2, num = 0.0, sx = 0.0, sy = 0.0;
    double mx = m_trainInstances.meanOrMode(m_trainInstances.attribute(att1));
    double my = m_trainInstances.meanOrMode(m_trainInstances.attribute(att2));

    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);
      diff1 = (inst.isMissing(att1))? 0.0 : (inst.value(att1) - mx);
      diff2 = (inst.isMissing(att2))? 0.0 : (inst.value(att2) - my);
      num += (diff1*diff2);
      sx += (diff1*diff1);
      sy += (diff2*diff2);
    }

    if (sx != 0.0) {
      if (m_std_devs[att1] == 1.0) {
        m_std_devs[att1] = Math.sqrt((sx/m_numInstances));
      }
    }

    if (sy != 0.0) {
      if (m_std_devs[att2] == 1.0) {
        m_std_devs[att2] = Math.sqrt((sy/m_numInstances));
      }
    }

    if ((sx*sy) > 0.0) {
      r = (num/(Math.sqrt(sx*sy)));
      return ((r < 0.0)? -r : r);
    }
    else {
      if (att1 != m_classIndex && att2 != m_classIndex) {
        return 1.0;
      }
      else {
        return 0.0;
      }
    }
  }


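  /**
   * Computes the weighted (absolute) correlation between a nominal and a
   * numeric attribute: the nominal attribute is expanded into binary
   * indicator attributes and the indicator/numeric correlations are combined,
   * weighted by the prior probabilities of the nominal values.
   *
   * @param att1 index of the nominal attribute
   * @param att2 index of the numeric attribute
   * @return the correlation between the two attributes
   */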
  private double num_nom2 (int att1, int att2) {
    int i, ii, k;
    double temp;
    Instance inst;
    int mx = (int)m_trainInstances.
      meanOrMode(m_trainInstances.attribute(att1));
    double my = m_trainInstances.
      meanOrMode(m_trainInstances.attribute(att2));
    double stdv_num = 0.0;
    double diff1, diff2;
    double r = 0.0, rr;
    int nx = (!m_missingSeparate)
      ? m_trainInstances.attribute(att1).numValues()
      : m_trainInstances.attribute(att1).numValues() + 1;

    double[] prior_nom = new double[nx];
    double[] stdvs_nom = new double[nx];
    double[] covs = new double[nx];

    for (i = 0; i < nx; i++) {
      stdvs_nom[i] = covs[i] = prior_nom[i] = 0.0;
    }

    // calculate frequencies (and means) of the values of the nominal
    // attribute
    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);

      if (inst.isMissing(att1)) {
        if (!m_missingSeparate) {
          ii = mx;
        }
        else {
          ii = nx - 1;
        }
      }
      else {
        ii = (int)inst.value(att1);
      }

      // increment freq for nominal
      prior_nom[ii]++;
    }

    for (k = 0; k < m_numInstances; k++) {
      inst = m_trainInstances.instance(k);
      // std dev of numeric attribute
      diff2 = (inst.isMissing(att2))? 0.0 : (inst.value(att2) - my);
      stdv_num += (diff2*diff2);

      //
      for (i = 0; i < nx; i++) {
        if (inst.isMissing(att1)) {
          if (!m_missingSeparate) {
            temp = (i == mx)? 1.0 : 0.0;
          }
          else {
            temp = (i == (nx - 1))? 1.0 : 0.0;
          }
        }
        else {
          temp = (i == inst.value(att1))? 1.0 : 0.0;
        }

        diff1 = (temp - (prior_nom[i]/m_numInstances));
        stdvs_nom[i] += (diff1*diff1);
        covs[i] += (diff1*diff2);
      }
    }

    // calculate weighted correlation
    for (i = 0, temp = 0.0; i < nx; i++) {
      // calculate the weighted variance of the nominal
      temp += ((prior_nom[i]/m_numInstances)*(stdvs_nom[i]/m_numInstances));

      if ((stdvs_nom[i]*stdv_num) > 0.0) {
        //System.out.println("Stdv :"+stdvs_nom[i]);
        rr = (covs[i]/(Math.sqrt(stdvs_nom[i]*stdv_num)));

        if (rr < 0.0) {
          rr = -rr;
        }

        r += ((prior_nom[i]/m_numInstances)*rr);
      }
      /* if there is zero variance for the numeric att at a specific
         level of the categorical att then if neither is the class then
         make this correlation at this level maximally bad i.e. 1.0.
         If either is the class then maximally bad correlation is 0.0 */
      else if (att1 != m_classIndex && att2 != m_classIndex) {
        r += ((prior_nom[i]/m_numInstances)*1.0);
      }
    }

    // set the standard deviations for these attributes if necessary
    // if ((att1 != classIndex) && (att2 != classIndex)) // =============
    if (temp != 0.0) {
      if (m_std_devs[att1] == 1.0) {
        m_std_devs[att1] = Math.sqrt(temp);
      }
    }

    if (stdv_num != 0.0) {
      if (m_std_devs[att2] == 1.0) {
        m_std_devs[att2] = Math.sqrt((stdv_num/m_numInstances));
      }
    }

    if (r == 0.0) {
      if (att1 != m_classIndex && att2 != m_classIndex) {
        r = 1.0;
      }
    }

    return r;
  }


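  /**
   * Computes the weighted (absolute) correlation between two nominal
   * attributes: each attribute is expanded into binary indicator attributes
   * and the pairwise indicator correlations are combined, weighted by the
   * joint prior probabilities of the value pairs.
   *
   * @param att1 index of the first nominal attribute
   * @param att2 index of the second nominal attribute
   * @return the correlation between the two attributes
   */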
  private double nom_nom (int att1, int att2) {
    int i, j, ii, jj, z;
    double temp1, temp2;
    Instance inst;
    int mx = (int)m_trainInstances.
      meanOrMode(m_trainInstances.attribute(att1));
    int my = (int)m_trainInstances.
      meanOrMode(m_trainInstances.attribute(att2));
    double diff1, diff2;
    double r = 0.0, rr;
    int nx = (!m_missingSeparate)
      ? m_trainInstances.attribute(att1).numValues()
      : m_trainInstances.attribute(att1).numValues() + 1;

    int ny = (!m_missingSeparate)
      ? m_trainInstances.attribute(att2).numValues()
      : m_trainInstances.attribute(att2).numValues() + 1;

    double[][] prior_nom = new double[nx][ny];
    double[] sumx = new double[nx];
    double[] sumy = new double[ny];
    double[] stdvsx = new double[nx];
    double[] stdvsy = new double[ny];
    double[][] covs = new double[nx][ny];

    for (i = 0; i < nx; i++) {
      sumx[i] = stdvsx[i] = 0.0;
    }

    for (j = 0; j < ny; j++) {
      sumy[j] = stdvsy[j] = 0.0;
    }

    for (i = 0; i < nx; i++) {
      for (j = 0; j < ny; j++) {
        covs[i][j] = prior_nom[i][j] = 0.0;
      }
    }

    // calculate frequencies (and means) of the values of the nominal
    // attribute
    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);

      if (inst.isMissing(att1)) {
        if (!m_missingSeparate) {
          ii = mx;
        }
        else {
          ii = nx - 1;
        }
      }
      else {
        ii = (int)inst.value(att1);
      }

      if (inst.isMissing(att2)) {
        if (!m_missingSeparate) {
          jj = my;
        }
        else {
          jj = ny - 1;
        }
      }
      else {
        jj = (int)inst.value(att2);
      }

      // increment freq for nominal
      prior_nom[ii][jj]++;
      sumx[ii]++;
      sumy[jj]++;
    }

    for (z = 0; z < m_numInstances; z++) {
      inst = m_trainInstances.instance(z);

      for (j = 0; j < ny; j++) {
        if (inst.isMissing(att2)) {
          if (!m_missingSeparate) {
            temp2 = (j == my)? 1.0 : 0.0;
          }
          else {
            temp2 = (j == (ny - 1))? 1.0 : 0.0;
          }
        }
        else {
          temp2 = (j == inst.value(att2))? 1.0 : 0.0;
        }

        diff2 = (temp2 - (sumy[j]/m_numInstances));
        stdvsy[j] += (diff2*diff2);
      }

      //
      for (i = 0; i < nx; i++) {
        if (inst.isMissing(att1)) {
          if (!m_missingSeparate) {
            temp1 = (i == mx)? 1.0 : 0.0;
          }
          else {
            temp1 = (i == (nx - 1))? 1.0 : 0.0;
          }
        }
        else {
          temp1 = (i == inst.value(att1))? 1.0 : 0.0;
        }

        diff1 = (temp1 - (sumx[i]/m_numInstances));
        stdvsx[i] += (diff1*diff1);

        for (j = 0; j < ny; j++) {
          if (inst.isMissing(att2)) {
            if (!m_missingSeparate) {
              temp2 = (j == my)? 1.0 : 0.0;
            }
            else {
              temp2 = (j == (ny - 1))? 1.0 : 0.0;
            }
          }
          else {
            temp2 = (j == inst.value(att2))? 1.0 : 0.0;
          }

          diff2 = (temp2 - (sumy[j]/m_numInstances));
          covs[i][j] += (diff1*diff2);
        }
      }
    }

    // calculate weighted correlation
    for (i = 0; i < nx; i++) {
      for (j = 0; j < ny; j++) {
        if ((stdvsx[i]*stdvsy[j]) > 0.0) {
          //System.out.println("Stdv :"+stdvs_nom[i]);
          rr = (covs[i][j]/(Math.sqrt(stdvsx[i]*stdvsy[j])));

          if (rr < 0.0) {
            rr = -rr;
          }

          r += ((prior_nom[i][j]/m_numInstances)*rr);
        }
        // if there is zero variance for either of the categorical atts then if
        // neither is the class then make this
        // correlation at this level maximally bad i.e. 1.0. If either is
        // the class then maximally bad correlation is 0.0
        else if (att1 != m_classIndex && att2 != m_classIndex) {
          r += ((prior_nom[i][j]/m_numInstances)*1.0);
        }
      }
    }

    // calculate weighted standard deviations for these attributes
    // (if necessary)
    for (i = 0, temp1 = 0.0; i < nx; i++) {
      temp1 += ((sumx[i]/m_numInstances)*(stdvsx[i]/m_numInstances));
    }

    if (temp1 != 0.0) {
      if (m_std_devs[att1] == 1.0) {
        m_std_devs[att1] = Math.sqrt(temp1);
      }
    }

    for (j = 0, temp2 = 0.0; j < ny; j++) {
      temp2 += ((sumy[j]/m_numInstances)*(stdvsy[j]/m_numInstances));
    }

    if (temp2 != 0.0) {
      if (m_std_devs[att2] == 1.0) {
        m_std_devs[att2] = Math.sqrt(temp2);
      }
    }

    if (r == 0.0) {
      if (att1 != m_classIndex && att2 != m_classIndex) {
        r = 1.0;
      }
    }

    return r;
  }


  /**
   * returns a string describing CFS
   *
   * @return the description as a string
   */
  public String toString () {
    StringBuffer text = new StringBuffer();

    if (m_trainInstances == null) {
      text.append("CFS subset evaluator has not been built yet\n");
    }
    else {
      text.append("\tCFS Subset Evaluator\n");

      if (m_missingSeparate) {
        text.append("\tTreating missing values as a separate value\n");
      }

      if (m_locallyPredictive) {
        text.append("\tIncluding locally predictive attributes\n");
      }
    }

    return text.toString();
  }


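  /**
   * Iteratively adds to best_group the remaining attribute with the highest
   * correlation with the class, provided no attribute already in the group
   * correlates more strongly with it than it does with the class
   * (within m_c_Threshold).
   *
   * @param best_group the subset found by the search, updated in place
   */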
  private void addLocallyPredictive (BitSet best_group) {
    int i, j;
    boolean done = false;
    boolean ok = true;
    double temp_best = -1.0;
    float corr;
    j = 0;
    BitSet temp_group = (BitSet)best_group.clone();
    int larger, smaller;

    while (!done) {
      temp_best = -1.0;

      // find best not already in group
      for (i = 0; i < m_numAttribs; i++) {
        if (i > m_classIndex) {
          larger = i; smaller = m_classIndex;
        } else {
          smaller = i; larger = m_classIndex;
        }
        /* int larger = (i > m_classIndex ? i : m_classIndex);
           int smaller = (i > m_classIndex ? m_classIndex : i); */
        if ((!temp_group.get(i)) && (i != m_classIndex)) {
          if (m_corr_matrix[larger][smaller] == -999) {
            corr = correlate(i, m_classIndex);
            m_corr_matrix[larger][smaller] = corr;
          }

          if (m_corr_matrix[larger][smaller] > temp_best) {
            temp_best = m_corr_matrix[larger][smaller];
            j = i;
          }
        }
      }

      if (temp_best == -1.0) {
        done = true;
      }
      else {
        ok = true;
        temp_group.set(j);

        // check the best against correlations with others already
        // in group
        for (i = 0; i < m_numAttribs; i++) {
          if (i > j) {
            larger = i; smaller = j;
          } else {
            larger = j; smaller = i;
          }
          /* int larger = (i > j ? i : j);
             int smaller = (i > j ? j : i); */
          if (best_group.get(i)) {
            if (m_corr_matrix[larger][smaller] == -999) {
              corr = correlate(i, j);
              m_corr_matrix[larger][smaller] = corr;
            }

            if (m_corr_matrix[larger][smaller] > temp_best - m_c_Threshold) {
              ok = false;
              break;
            }
          }
        }

        // if ok then add to best_group
        if (ok) {
          best_group.set(j);
        }
      }
    }
  }


  /**
   * Calls addLocallyPredictive in order to include locally predictive
   * attributes (if requested).
   *
   * @param attributeSet the set of attributes found by the search
   * @return a possibly ranked list of postprocessed attributes
   * @throws Exception if postprocessing fails for some reason
   */
  public int[] postProcess (int[] attributeSet)
    throws Exception {
    int j = 0;

    if (!m_locallyPredictive) {
      // m_trainInstances = new Instances(m_trainInstances,0);
      return attributeSet;
    }

    BitSet bestGroup = new BitSet(m_numAttribs);

    for (int i = 0; i < attributeSet.length; i++) {
      bestGroup.set(attributeSet[i]);
    }

    addLocallyPredictive(bestGroup);

    // count how many are set
    for (int i = 0; i < m_numAttribs; i++) {
      if (bestGroup.get(i)) {
        j++;
      }
    }

    int[] newSet = new int[j];
    j = 0;

    for (int i = 0; i < m_numAttribs; i++) {
      if (bestGroup.get(i)) {
        newSet[j++] = i;
      }
    }

    // m_trainInstances = new Instances(m_trainInstances,0);
    return newSet;
  }


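  /**
   * Resets the evaluator to its default settings: no training data, missing
   * values not treated separately, locally predictive attributes included,
   * and a local-predictiveness threshold of 0.
   */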
  protected void resetOptions () {
    m_trainInstances = null;
    m_missingSeparate = false;
    m_locallyPredictive = true;
    m_c_Threshold = 0.0;
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 6132 $");
  }

  /**
   * Main method for testing this class.
   *
   * @param args the options
   */
  public static void main (String[] args) {
    runEvaluator(new CfsSubsetEval(), args);
  }
}