source: src/main/java/weka/attributeSelection/RaceSearch.java @ 21

Last change on this file since 21 was 4, checked in by gnappo, 14 years ago

Import of weka.

[4]1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    RaceSearch.java
19 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.attributeSelection;
24
25import weka.core.Instance;
26import weka.core.Instances;
27import weka.core.Option;
28import weka.core.OptionHandler;
29import weka.core.RevisionUtils;
30import weka.core.SelectedTag;
31import weka.core.Statistics;
32import weka.core.Tag;
33import weka.core.TechnicalInformation;
34import weka.core.TechnicalInformation.Type;
35import weka.core.TechnicalInformation.Field;
36import weka.core.TechnicalInformationHandler;
37import weka.core.Utils;
38import weka.experiment.PairedStats;
39import weka.experiment.Stats;
40
41import java.util.BitSet;
42import java.util.Enumeration;
43import java.util.Random;
44import java.util.Vector;
45
46/**
47 <!-- globalinfo-start -->
 48 * Races the cross validation error of competing attribute subsets. Use in conjunction with a ClassifierSubsetEval. RaceSearch has four modes:<br/>
49 * <br/>
 50 * forward selection races all single attribute additions to a base set (initially no attributes), selects the winner to become the new base set and then iterates until there is no improvement over the base set. <br/>
51 * <br/>
52 * Backward elimination is similar but the initial base set has all attributes included and races all single attribute deletions. <br/>
53 * <br/>
 54 * Schemata search is a bit different. In each iteration a series of races is run in parallel. Each race in a set determines whether a particular attribute should be included or not---i.e. the race is between the attribute being "in" or "out". The other attributes for this race are included or excluded randomly at each point in the evaluation. As soon as one race has a clear winner (i.e. it has been decided whether a particular attribute should be in or not) then the next set of races begins, using the result of the winning race from the previous iteration as the new base set.<br/>
55 * <br/>
56 * Rank race first ranks the attributes using an attribute evaluator and then races the ranking. The race includes no attributes, the top ranked attribute, the top two attributes, the top three attributes, etc.<br/>
57 * <br/>
 58 * It is also possible to generate a ranked list of attributes through the forward racing process. If generateRanking is set to true then a complete forward race will be run---that is, racing continues until all attributes have been selected. The order that they are added in determines a complete ranking of all the attributes.<br/>
59 * <br/>
60 * Racing uses paired and unpaired t-tests on cross-validation errors of competing subsets. When there is a significant difference between the means of the errors of two competing subsets then the poorer of the two can be eliminated from the race. Similarly, if there is no significant difference between the mean errors of two competing subsets and they are within some threshold of each other, then one can be eliminated from the race.<br/>
61 * <br/>
62 * For more information see:<br/>
63 * <br/>
64 * Andrew W. Moore, Mary S. Lee: Efficient Algorithms for Minimizing Cross Validation Error. In: Eleventh International Conference on Machine Learning, 190-198, 1994.
65 * <p/>
66 <!-- globalinfo-end -->
67 *
68 <!-- technical-bibtex-start -->
69 * BibTeX:
70 * <pre>
71 * &#64;inproceedings{Moore1994,
72 *    author = {Andrew W. Moore and Mary S. Lee},
73 *    booktitle = {Eleventh International Conference on Machine Learning},
74 *    pages = {190-198},
75 *    publisher = {Morgan Kaufmann},
76 *    title = {Efficient Algorithms for Minimizing Cross Validation Error},
77 *    year = {1994}
78 * }
79 * </pre>
80 * <p/>
81 <!-- technical-bibtex-end -->
82 *
83 <!-- options-start -->
84 * Valid options are: <p/>
85 *
86 * <pre> -R &lt;0 = forward | 1 = backward race | 2 = schemata | 3 = rank&gt;
87 *  Type of race to perform.
88 *  (default = 0).</pre>
89 *
90 * <pre> -L &lt;significance&gt;
 91 *  Significance level for comparisons
92 *  (default = 0.001(forward/backward/rank)/0.01(schemata)).</pre>
93 *
94 * <pre> -T &lt;threshold&gt;
95 *  Threshold for error comparison.
96 *  (default = 0.001).</pre>
97 *
98 * <pre> -A &lt;attribute evaluator&gt;
99 *  Attribute ranker to use if doing a
100 *  rank search. Place any
101 *  evaluator options LAST on
102 *  the command line following a "--".
103 *  eg. -A weka.attributeSelection.GainRatioAttributeEval ... -- -M.
104 *  (default = GainRatioAttributeEval)</pre>
105 *
106 * <pre> -F &lt;0 = 10 fold | 1 = leave-one-out&gt;
107 *  Folds for cross validation
 108 *  (default = 0 (1 if schemata race))</pre>
109 *
110 * <pre> -Q
111 *  Generate a ranked list of attributes.
112 *  Forces the search to be forward
 113 *  and races until all attributes have been
114 *  selected, thus producing a ranking.</pre>
115 *
116 * <pre> -N &lt;num to select&gt;
117 *  Specify number of attributes to retain from
 118 *  the ranking. Overrides -T. Use in conjunction with -Q</pre>
119 *
120 * <pre> -J &lt;threshold&gt;
 121 *  Specify a threshold by which attributes
122 *  may be discarded from the ranking.
 123 *  Use in conjunction with -Q</pre>
124 *
125 * <pre> -Z
126 *  Verbose output for monitoring the search.</pre>
127 *
128 * <pre>
129 * Options specific to evaluator weka.attributeSelection.GainRatioAttributeEval:
130 * </pre>
131 *
132 * <pre> -M
 133 *  treat missing values as a separate value.</pre>
134 *
135 <!-- options-end -->
136 *
137 * @author Mark Hall (mhall@cs.waikato.ac.nz)
138 * @version $Revision: 1.26 $
139 */
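/*
 * Illustrative usage sketch: one way to drive this search programmatically is
 * through weka.attributeSelection.AttributeSelection together with a
 * ClassifierSubsetEval, as described in globalInfo() below. The surrounding
 * class and method names are assumed from the standard weka.attributeSelection
 * API and may differ slightly between Weka versions.
 *
 *   Instances data = ...;                        // training data with the class attribute set
 *   ClassifierSubsetEval eval = new ClassifierSubsetEval();
 *   RaceSearch search = new RaceSearch();
 *   search.setRaceType(new SelectedTag(0, RaceSearch.TAGS_SELECTION)); // 0 = forward race
 *   AttributeSelection attsel = new AttributeSelection();
 *   attsel.setEvaluator(eval);
 *   attsel.setSearch(search);
 *   attsel.SelectAttributes(data);
 *   int[] selected = attsel.selectedAttributes(); // winning subset (plus the class index)
 */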
140public class RaceSearch 
141  extends ASSearch
142  implements RankedOutputSearch, OptionHandler, TechnicalInformationHandler {
143
144  /** for serialization */
145  static final long serialVersionUID = 4015453851212985720L;
146
147  /** the training instances */
148  private Instances m_Instances = null;
149
150  /** search types */
151  private static final int FORWARD_RACE = 0;
152  private static final int BACKWARD_RACE = 1;
153  private static final int SCHEMATA_RACE = 2;
154  private static final int RANK_RACE = 3;
155  public static final Tag [] TAGS_SELECTION = {
156    new Tag(FORWARD_RACE, "Forward selection race"),
157    new Tag(BACKWARD_RACE, "Backward elimination race"),
158    new Tag(SCHEMATA_RACE, "Schemata race"),
159    new Tag(RANK_RACE, "Rank race")
160      };
161 
162  /** the selected search type */
163  private int m_raceType = FORWARD_RACE;
164 
165  /** xval types */
166  private static final int TEN_FOLD = 0;
167  private static final int LEAVE_ONE_OUT = 1;
168  public static final Tag [] XVALTAGS_SELECTION = {
169    new Tag(TEN_FOLD, "10 Fold"),
170    new Tag(LEAVE_ONE_OUT, "Leave-one-out"),
171      };
172
173  /** the selected xval type */
174  private int m_xvalType = TEN_FOLD;
175 
176  /** the class index */
177  private int m_classIndex;
178
179  /** the number of attributes in the data */
180  private int m_numAttribs;
181
182  /** the total number of partially/fully evaluated subsets */
183  private int m_totalEvals;
184
185  /** holds the merit of the best subset found */
186  private double m_bestMerit = -Double.MAX_VALUE;
187
188  /** the subset evaluator to use */
189  private HoldOutSubsetEvaluator m_theEvaluator = null;
190
191  /** the significance level for comparisons */
192  private double m_sigLevel = 0.001;
193
194  /** threshold for comparisons */
195  private double m_delta = 0.001;
196
197  /** the number of samples above which to begin testing for similarity
198      between competing subsets */
199  private int m_samples = 20;
200
201  /** number of cross validation folds---equal to the number of instances
202      for leave-one-out cv */
203  private int m_numFolds = 10;
204
205  /** the attribute evaluator to generate the initial ranking when
206      doing a rank race */
207  private ASEvaluation m_ASEval = new GainRatioAttributeEval();
208
209  /** will hold the attribute ranking produced by the above attribute
210      evaluator if doing a rank search */
211  private int [] m_Ranking;
212
213  /** verbose output for monitoring the search and debugging */
214  private boolean m_debug = false;
215
216  /** If true then produce a ranked list of attributes by fully traversing
217      a forward hillclimb race */
218  private boolean m_rankingRequested = false;
219
220  /** The ranked list of attributes produced if m_rankingRequested is true */
221  private double [][] m_rankedAtts;
222
223  /** The number of attributes ranked so far (if ranking is requested) */
224  private int m_rankedSoFar;
225
226  /** The number of attributes to retain if a ranking is requested. -1
227      indicates that all attributes are to be retained. Has precedence over
228      m_threshold */
229  private int m_numToSelect = -1;
230
231  private int m_calculatedNumToSelect = -1;
232
233  /** the threshold for removing attributes if ranking is requested */
234  private double m_threshold = -Double.MAX_VALUE;
235
236  /**
237   * Returns a string describing this search method
238   * @return a description of the search method suitable for
239   * displaying in the explorer/experimenter gui
240   */
241  public String globalInfo() {
242    return "Races the cross validation error of competing "
 243      +"attribute subsets. Use in conjunction with a ClassifierSubsetEval. "
244      +"RaceSearch has four modes:\n\nforward selection "
245      +"races all single attribute additions to a base set (initially "
 246      +"no attributes), selects the winner to become the new base set "
247      +"and then iterates until there is no improvement over the base set. "
248      +"\n\nBackward elimination is similar but the initial base set has all "
249      +"attributes included and races all single attribute deletions. "
 250      +"\n\nSchemata search is a bit different. In each iteration a series of "
 251      +"races is run in parallel. Each race in a set determines whether "
 252      +"a particular attribute should be included or not---i.e. the race is "
253      +"between the attribute being \"in\" or \"out\". The other attributes "
254      +"for this race are included or excluded randomly at each point in the "
255      +"evaluation. As soon as one race "
 256      +"has a clear winner (i.e. it has been decided whether a particular "
 257      +"attribute should be in or not) then the next set of races begins, "
 258      +"using the result of the winning race from the previous iteration as the "
259      +"new base set.\n\nRank race first ranks the attributes using an "
260      +"attribute evaluator and then races the ranking. The race includes "
261      +"no attributes, the top ranked attribute, the top two attributes, the "
262      +"top three attributes, etc.\n\nIt is also possible to generate a "
 263      +"ranked list of attributes through the forward racing process. "
264      +"If generateRanking is set to true then a complete forward race will "
265      +"be run---that is, racing continues until all attributes have been "
266      +"selected. The order that they are added in determines a complete "
267      +"ranking of all the attributes.\n\nRacing uses paired and unpaired "
268      +"t-tests on cross-validation errors of competing subsets. When there "
269      +"is a significant difference between the means of the errors of two "
270      +"competing subsets then the poorer of the two can be eliminated from "
271      +"the race. Similarly, if there is no significant difference between "
272      +"the mean errors of two competing subsets and they are within some "
273      +"threshold of each other, then one can be eliminated from the race.\n\n"
274      + "For more information see:\n\n"
275      + getTechnicalInformation().toString();
276  }
277
278  /**
279   * Returns an instance of a TechnicalInformation object, containing
280   * detailed information about the technical background of this class,
281   * e.g., paper reference or book this class is based on.
282   *
283   * @return the technical information about this class
284   */
285  public TechnicalInformation getTechnicalInformation() {
286    TechnicalInformation        result;
287   
288    result = new TechnicalInformation(Type.INPROCEEDINGS);
289    result.setValue(Field.AUTHOR, "Andrew W. Moore and Mary S. Lee");
290    result.setValue(Field.TITLE, "Efficient Algorithms for Minimizing Cross Validation Error");
291    result.setValue(Field.BOOKTITLE, "Eleventh International Conference on Machine Learning");
292    result.setValue(Field.YEAR, "1994");
293    result.setValue(Field.PAGES, "190-198");
294    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
295   
296    return result;
297  }
298
299  /**
300   * Returns the tip text for this property
301   * @return tip text for this property suitable for
302   * displaying in the explorer/experimenter gui
303   */
304  public String raceTypeTipText() {
305    return "Set the type of search.";
306  }
307
308  /**
309   * Set the race type
310   *
311   * @param d the type of race
312   */
313  public void setRaceType (SelectedTag d) {
314   
315    if (d.getTags() == TAGS_SELECTION) {
316      m_raceType = d.getSelectedTag().getID();
317    }
318    if (m_raceType == SCHEMATA_RACE && !m_rankingRequested) {
319      try {
320        setFoldsType(new SelectedTag(LEAVE_ONE_OUT,
321                                     XVALTAGS_SELECTION));
322        setSignificanceLevel(0.01);
323      } catch (Exception ex) {
324      }
325    } else {
326      try {
327        setFoldsType(new SelectedTag(TEN_FOLD,
328                                     XVALTAGS_SELECTION));
329        setSignificanceLevel(0.001);
330      } catch (Exception ex) {
331      }
332    }
333  }
334
335  /**
336   * Get the race type
337   *
338   * @return the type of race
339   */
340  public SelectedTag getRaceType() {
341    return new SelectedTag(m_raceType, TAGS_SELECTION);
342  }
343
344  /**
345   * Returns the tip text for this property
346   * @return tip text for this property suitable for
347   * displaying in the explorer/experimenter gui
348   */
349  public String significanceLevelTipText() {
350    return "Set the significance level to use for t-test comparisons.";
351  }
352
353  /**
354   * Sets the significance level to use
355   * @param sig the significance level
356   */
357  public void setSignificanceLevel(double sig) {
358    m_sigLevel = sig;
359  }
360
361  /**
362   * Get the significance level
363   * @return the current significance level
364   */
365  public double getSignificanceLevel() {
366    return m_sigLevel;
367  }
368 
369  /**
370   * Returns the tip text for this property
371   * @return tip text for this property suitable for
372   * displaying in the explorer/experimenter gui
373   */
374  public String thresholdTipText() {
375    return "Set the error threshold by which to consider two subsets "
376      +"equivalent.";
377  }
378
379  /**
380   * Sets the threshold for comparisons
381   * @param t the threshold to use
382   */
383  public void setThreshold(double t) {
384    m_delta = t;
385  }
386
387  /**
388   * Get the threshold
389   * @return the current threshold
390   */
391  public double getThreshold() {
392    return m_delta;
393  }
394
395  /**
396   * Returns the tip text for this property
397   * @return tip text for this property suitable for
398   * displaying in the explorer/experimenter gui
399   */
400  public String foldsTypeTipText() {
401    return "Set the number of folds to use for x-val error estimation; "
402      +"leave-one-out is selected automatically for schemata search.";
403  }
404
405  /**
406   * Set the xfold type
407   *
408   * @param d the type of xval
409   */
410  public void setFoldsType (SelectedTag d) {
411   
412    if (d.getTags() == XVALTAGS_SELECTION) {
413      m_xvalType = d.getSelectedTag().getID();
414    }
415  }
416
417  /**
418   * Get the xfold type
419   *
420   * @return the type of xval
421   */
422  public SelectedTag getFoldsType () {
423    return new SelectedTag(m_xvalType, XVALTAGS_SELECTION);
424  }
425
426  /**
427   * Returns the tip text for this property
428   * @return tip text for this property suitable for
429   * displaying in the explorer/experimenter gui
430   */
431  public String debugTipText() {
432    return "Turn on verbose output for monitoring the search's progress.";
433  }
434
435  /**
436   * Set whether verbose output should be generated.
437   * @param d true if output is to be verbose.
438   */
439  public void setDebug(boolean d) {
440    m_debug = d;
441  }
442
443  /**
444   * Get whether output is to be verbose
445   * @return true if output will be verbose
446   */
447  public boolean getDebug() {
448    return m_debug;
449  }
450
451  /**
452   * Returns the tip text for this property
453   * @return tip text for this property suitable for
454   * displaying in the explorer/experimenter gui
455   */
456  public String attributeEvaluatorTipText() {
457    return "Attribute evaluator to use for generating an initial ranking. "
458      +"Use in conjunction with a rank race";   
459  }
460
461  /**
462   * Set the attribute evaluator to use for generating the ranking.
463   * @param newEvaluator the attribute evaluator to use.
464   */
465  public void setAttributeEvaluator(ASEvaluation newEvaluator) {
466    m_ASEval = newEvaluator;
467  }
468
469  /**
470   * Get the attribute evaluator used to generate the ranking.
471   * @return the evaluator used to generate the ranking.
472   */
473  public ASEvaluation getAttributeEvaluator() {
474    return m_ASEval;
475  }
476
477    /**
478   * Returns the tip text for this property
479   * @return tip text for this property suitable for
480   * displaying in the explorer/experimenter gui
481   */
482  public String generateRankingTipText() {
483    return "Use the racing process to generate a ranked list of attributes. "
484      +"Using this mode forces the race to be a forward type and then races "
485      +"until all attributes have been added, thus giving a ranked list";
486  }
487 
488  /**
489   * Records whether the user has requested a ranked list of attributes.
490   * @param doRank true if ranking is requested
491   */
492  public void setGenerateRanking(boolean doRank) {
493    m_rankingRequested = doRank;
494    if (m_rankingRequested) {
495      try {
496        setRaceType(new SelectedTag(FORWARD_RACE,
497                                    TAGS_SELECTION));
498      } catch (Exception ex) {
499      }
500    }
501  }
502
503  /**
504   * Gets whether ranking has been requested. This is used by the
505   * AttributeSelection module to determine if rankedAttributes()
506   * should be called.
507   * @return true if ranking has been requested.
508   */
509  public boolean getGenerateRanking() {
510    return m_rankingRequested;
511  }
512
513  /**
514   * Returns the tip text for this property
515   * @return tip text for this property suitable for
516   * displaying in the explorer/experimenter gui
517   */
518  public String numToSelectTipText() {
519    return "Specify the number of attributes to retain. Use in conjunction "
520      +"with generateRanking. The default value "
521      +"(-1) indicates that all attributes are to be retained. Use either "
522      +"this option or a threshold to reduce the attribute set.";
523  }
524
525  /**
526   * Specify the number of attributes to select from the ranked list
527   * (if generating a ranking). -1
528   * indicates that all attributes are to be retained.
529   * @param n the number of attributes to retain
530   */
531  public void setNumToSelect(int n) {
532    m_numToSelect = n;
533  }
534
535  /**
536   * Gets the number of attributes to be retained.
537   * @return the number of attributes to retain
538   */
539  public int getNumToSelect() {
540    return m_numToSelect;
541  }
542
543  /**
544   * Gets the calculated number of attributes to retain. This is the
545   * actual number of attributes to retain. This is the same as
546   * getNumToSelect if the user specifies a number which is not less
547   * than zero. Otherwise it should be the number of attributes in the
548   * (potentially transformed) data.
549   */
550  public int getCalculatedNumToSelect() {
551    if (m_numToSelect >= 0) {
552      m_calculatedNumToSelect = m_numToSelect;
553    }
554    return m_calculatedNumToSelect;
555  }
556
557  /**
558   * Returns the tip text for this property
559   * @return tip text for this property suitable for
560   * displaying in the explorer/experimenter gui
561   */
562  public String selectionThresholdTipText() {
563    return "Set threshold by which attributes can be discarded. Default value "
564      + "results in no attributes being discarded. Use in conjunction with "
565      + "generateRanking";
566  }
567
568  /**
569   * Set the threshold by which the AttributeSelection module can discard
570   * attributes.
571   * @param threshold the threshold.
572   */
573  public void setSelectionThreshold(double threshold) {
574    m_threshold = threshold;
575  }
576
577  /**
578   * Returns the threshold so that the AttributeSelection module can
579   * discard attributes from the ranking.
580   */
581  public double getSelectionThreshold() {
582    return m_threshold;
583  }
584
585
586  /**
587   * Returns an enumeration describing the available options.
588   * @return an enumeration of all the available options.
589   **/
590  public Enumeration listOptions () {
591    Vector newVector = new Vector();
592   
593     newVector.addElement(new Option(
594         "\tType of race to perform.\n"
595         + "\t(default = 0).",
596         "R", 1 ,"-R <0 = forward | 1 = backward race | 2 = schemata | 3 = rank>"));
597     
598     newVector.addElement(new Option(
 599         "\tSignificance level for comparisons\n"
600         + "\t(default = 0.001(forward/backward/rank)/0.01(schemata)).",
601         "L",1,"-L <significance>"));
602     
603     newVector.addElement(new Option(
604         "\tThreshold for error comparison.\n"
605         + "\t(default = 0.001).",
606         "T",1,"-T <threshold>"));
607     
608     newVector.addElement(new Option(
609         "\tAttribute ranker to use if doing a \n"
610         + "\trank search. Place any\n"
611         + "\tevaluator options LAST on \n"
612         + "\tthe command line following a \"--\".\n" 
613         + "\teg. -A weka.attributeSelection.GainRatioAttributeEval ... -- -M.\n"
614         + "\t(default = GainRatioAttributeEval)", 
615         "A", 1, "-A <attribute evaluator>"));
616   
617     newVector.addElement(new Option(
618         "\tFolds for cross validation\n"
 619         + "\t(default = 0 (1 if schemata race))",
620         "F",1,"-F <0 = 10 fold | 1 = leave-one-out>"));
621     
622     newVector.addElement(new Option(
623         "\tGenerate a ranked list of attributes.\n"
624         +"\tForces the search to be forward\n"
 625         +"\tand races until all attributes have been\n"
626         +"\tselected, thus producing a ranking.",
627         "Q",0,"-Q"));
628
629    newVector.addElement(new Option(
630        "\tSpecify number of attributes to retain from \n"
 631        + "\tthe ranking. Overrides -T. Use in conjunction with -Q", 
632        "N", 1, "-N <num to select>"));
633
634    newVector.addElement(new Option(
 635        "\tSpecify a threshold by which attributes\n" 
636        + "\tmay be discarded from the ranking.\n"
 637        +"\tUse in conjunction with -Q",
638        "J",1, "-J <threshold>"));
639
640     newVector.addElement(new Option(
641         "\tVerbose output for monitoring the search.",
642         "Z",0,"-Z"));
643     
644     if ((m_ASEval != null) && 
645         (m_ASEval instanceof OptionHandler)) {
646       newVector.addElement(new Option(
647           "", 
648           "", 0, "\nOptions specific to evaluator " 
649           + m_ASEval.getClass().getName() + ":"));
650
651       Enumeration enu = ((OptionHandler)m_ASEval).listOptions();
652       while (enu.hasMoreElements()) {
653         newVector.addElement(enu.nextElement());
654       }
655     }
656     
657     return newVector.elements();
658  }
659
660  /**
661   * Parses a given list of options. <p/>
662   *
663   <!-- options-start -->
664   * Valid options are: <p/>
665   *
666   * <pre> -R &lt;0 = forward | 1 = backward race | 2 = schemata | 3 = rank&gt;
667   *  Type of race to perform.
668   *  (default = 0).</pre>
669   *
670   * <pre> -L &lt;significance&gt;
 671   *  Significance level for comparisons
672   *  (default = 0.001(forward/backward/rank)/0.01(schemata)).</pre>
673   *
674   * <pre> -T &lt;threshold&gt;
675   *  Threshold for error comparison.
676   *  (default = 0.001).</pre>
677   *
678   * <pre> -A &lt;attribute evaluator&gt;
679   *  Attribute ranker to use if doing a
680   *  rank search. Place any
681   *  evaluator options LAST on
682   *  the command line following a "--".
683   *  eg. -A weka.attributeSelection.GainRatioAttributeEval ... -- -M.
684   *  (default = GainRatioAttributeEval)</pre>
685   *
686   * <pre> -F &lt;0 = 10 fold | 1 = leave-one-out&gt;
687   *  Folds for cross validation
 688   *  (default = 0 (1 if schemata race))</pre>
689   *
690   * <pre> -Q
691   *  Generate a ranked list of attributes.
692   *  Forces the search to be forward
 693   *  and races until all attributes have been
694   *  selected, thus producing a ranking.</pre>
695   *
696   * <pre> -N &lt;num to select&gt;
697   *  Specify number of attributes to retain from
 698   *  the ranking. Overrides -T. Use in conjunction with -Q</pre>
699   *
700   * <pre> -J &lt;threshold&gt;
 701   *  Specify a threshold by which attributes
702   *  may be discarded from the ranking.
 703   *  Use in conjunction with -Q</pre>
704   *
705   * <pre> -Z
706   *  Verbose output for monitoring the search.</pre>
707   *
708   * <pre>
709   * Options specific to evaluator weka.attributeSelection.GainRatioAttributeEval:
710   * </pre>
711   *
712   * <pre> -M
 713   *  treat missing values as a separate value.</pre>
714   *
715   <!-- options-end -->
716   *
717   * @param options the list of options as an array of strings
718   * @throws Exception if an option is not supported
719   */
720  public void setOptions (String[] options)
721    throws Exception {
722    String optionString;
723    resetOptions();
724   
725    optionString = Utils.getOption('R', options);
726    if (optionString.length() != 0) {
727      setRaceType(new SelectedTag(Integer.parseInt(optionString),
728                                  TAGS_SELECTION));
729    }
730   
731    optionString = Utils.getOption('F', options);
732    if (optionString.length() != 0) {
733      setFoldsType(new SelectedTag(Integer.parseInt(optionString),
734                                  XVALTAGS_SELECTION));
735    }
736
737    optionString = Utils.getOption('L', options);
738    if (optionString.length() !=0) {
739      setSignificanceLevel(Double.parseDouble(optionString));
740    }
741
742    optionString = Utils.getOption('T', options);
743    if (optionString.length() !=0) {
744      setThreshold(Double.parseDouble(optionString));
745    }
746
747    optionString = Utils.getOption('A', options);
748    if (optionString.length() != 0) {
749      setAttributeEvaluator(ASEvaluation.forName(optionString, 
750                            Utils.partitionOptions(options)));
751    }
752
753    setGenerateRanking(Utils.getFlag('Q', options));
754
755    optionString = Utils.getOption('J', options);
756    if (optionString.length() != 0) {
757      setSelectionThreshold(Double.parseDouble(optionString));
758    }
759   
760    optionString = Utils.getOption('N', options);
761    if (optionString.length() != 0) {
762      setNumToSelect(Integer.parseInt(optionString));
763    }
764
765    setDebug(Utils.getFlag('Z', options));
766  }
767
768  /**
 769   * Gets the current settings of RaceSearch.
770   * @return an array of strings suitable for passing to setOptions()
771   */
772  public String[] getOptions () {
773    int current = 0;
774    String[] evaluatorOptions = new String[0];
775
776    if ((m_ASEval != null) && 
777        (m_ASEval instanceof OptionHandler)) {
778      evaluatorOptions = ((OptionHandler)m_ASEval).getOptions();
779    }
780    String[] options = new String[17+evaluatorOptions.length];
781
782    options[current++] = "-R"; options[current++] = ""+m_raceType;
783    options[current++] = "-L"; options[current++] = ""+getSignificanceLevel();
784    options[current++] = "-T"; options[current++] = ""+getThreshold();
785    options[current++] = "-F"; options[current++] = ""+m_xvalType;
786    if (getGenerateRanking()) {
787      options[current++] = "-Q";
788    }
789    options[current++] = "-N"; options[current++] = ""+getNumToSelect();
790    options[current++] = "-J"; options[current++] = ""+getSelectionThreshold();
791    if (getDebug()) {
792      options[current++] = "-Z";
793    }
794   
795    if (getAttributeEvaluator() != null) {
796      options[current++] = "-A";
797      options[current++] = getAttributeEvaluator().getClass().getName();
798      options[current++] = "--";
799      System.arraycopy(evaluatorOptions, 0, options, current, 
800                       evaluatorOptions.length);
801      current += evaluatorOptions.length;
802    }
803
804   
805    while (current < options.length) {
806      options[current++] = "";
807    }
808
809    return  options;
810  }
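  /*
   * Illustrative example (assumed, not part of the original source): with the
   * default settings the option array assembled above roughly corresponds to
   * the command-line string
   *
   *   -R 0 -L 0.001 -T 0.001 -F 0 -N -1 -J -1.7976931348623157E308
   *      -A weka.attributeSelection.GainRatioAttributeEval --
   *
   * i.e. a forward race with 10-fold cross validation, which can be passed
   * back to setOptions() or used as the search specification of an attribute
   * selection run.
   */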
811
812
813
814
815  /**
816   * Searches the attribute subset space by racing cross validation
817   * errors of competing subsets
818   *
819   * @param ASEval the attribute evaluator to guide the search
820   * @param data the training instances.
821   * @return an array (not necessarily ordered) of selected attribute indexes
822   * @throws Exception if the search can't be completed
823   */
824  public int[] search (ASEvaluation ASEval, Instances data)
825    throws Exception {
826    if (!(ASEval instanceof SubsetEvaluator)) {
827      throw  new Exception(ASEval.getClass().getName() 
828                           + " is not a " 
829                           + "Subset evaluator! (RaceSearch)");
830    }
831
832    if (ASEval instanceof UnsupervisedSubsetEvaluator) {
833      throw new Exception("Can't use an unsupervised subset evaluator "
834                          +"(RaceSearch).");
835    }
836
837    if (!(ASEval instanceof HoldOutSubsetEvaluator)) {
838      throw new Exception("Must use a HoldOutSubsetEvaluator, eg. "
839                          +"weka.attributeSelection.ClassifierSubsetEval "
840                          +"(RaceSearch)");
841    }
842
843    if (!(ASEval instanceof ErrorBasedMeritEvaluator)) {
844      throw new Exception("Only error based subset evaluators can be used, "
845                          +"eg. weka.attributeSelection.ClassifierSubsetEval "
846                          +"(RaceSearch)");
847    }
848
849    m_Instances = new Instances(data);
850    m_Instances.deleteWithMissingClass();
851    if (m_Instances.numInstances() == 0) {
852      throw new Exception("All train instances have missing class! (RaceSearch)");
853    }
854    if (m_rankingRequested && m_numToSelect > m_Instances.numAttributes()-1) {
855      throw new Exception("More attributes requested than exist in the data "
856                          +"(RaceSearch).");
857    }
858    m_theEvaluator = (HoldOutSubsetEvaluator)ASEval;
859    m_numAttribs = m_Instances.numAttributes();
860    m_classIndex = m_Instances.classIndex();
861
862    if (m_rankingRequested) {
863      m_rankedAtts = new double[m_numAttribs-1][2];
864      m_rankedSoFar = 0;
865    }
866
867    if (m_xvalType == LEAVE_ONE_OUT) {
868      m_numFolds = m_Instances.numInstances();
869    } else {
870      m_numFolds = 10;
871    }
872
873    Random random = new Random(1); // I guess this should really be a parameter?
874    m_Instances.randomize(random);
875    int [] bestSubset=null;
876
877    switch (m_raceType) {
878    case FORWARD_RACE:
879    case BACKWARD_RACE: 
880      bestSubset = hillclimbRace(m_Instances, random);
881      break;
882    case SCHEMATA_RACE:
883      bestSubset = schemataRace(m_Instances, random);
884      break;
885    case RANK_RACE:
886      bestSubset = rankRace(m_Instances, random);
887      break;
888    }
889
890    return bestSubset;
891  }
892
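  /**
   * Produces the ranked list of attributes from a completed forward race.
   * Requires that a ranking was requested (generateRanking) and that search()
   * has already been run.
   * @return a two-column array holding, per row, an attribute index and the
   * error associated with adding that attribute
   * @throws Exception if no ranking was requested or no search has been
   * performed
   */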
893  public double [][] rankedAttributes() throws Exception {
894    if (!m_rankingRequested) {
895      throw new Exception("Need to request a ranked list of attributes "
896                          +"before attributes can be ranked (RaceSearch).");
897    }
898    if (m_rankedAtts == null) {
899      throw new Exception("Search must be performed before attributes "
900                          +"can be ranked (RaceSearch).");
901    }
902   
903    double [][] final_rank = new double [m_rankedSoFar][2];
904    for (int i=0;i<m_rankedSoFar;i++) {
905      final_rank[i][0] = m_rankedAtts[i][0];
906      final_rank[i][1] = m_rankedAtts[i][1];
907    }
908
909    if (m_numToSelect <= 0) {
910      if (m_threshold == -Double.MAX_VALUE) {
911        m_calculatedNumToSelect = final_rank.length;
912      } else {
913        determineNumToSelectFromThreshold(final_rank);
914      }
915    }
916
917    return final_rank;
918  }
919
920  private void determineNumToSelectFromThreshold(double [][] ranking) {
921    int count = 0;
922    for (int i = 0; i < ranking.length; i++) {
923      if (ranking[i][1] > m_threshold) {
924        count++;
925      }
926    }
927    m_calculatedNumToSelect = count;
928  }
929
930  /**
931   * Print an attribute set.
932   */
933  private String printSets(char [][]raceSets) {
934    StringBuffer temp = new StringBuffer();
935    for (int i=0;i<raceSets.length;i++) {
936      for (int j=0;j<m_numAttribs;j++) {
937        temp.append(raceSets[i][j]);
938      }
939      temp.append('\n');
940    }
941    return temp.toString();
942  }
943
944  /**
945   * Performs a schemata race---a series of races in parallel.
946   * @param data the instances to estimate accuracy over.
947   * @param random a random number generator
948   * @return an array of selected attribute indices.
949   */
950  private int [] schemataRace(Instances data, Random random) throws Exception {
951    // # races, 2 (competitors in each race), # attributes
952    char [][][] parallelRaces;
953    int numRaces = m_numAttribs-1;
954    Random r = new Random(42);
955    int numInstances = data.numInstances();
956    Instances trainCV; Instances testCV;
957    Instance testInstance;
958
959    // statistics on the racers
960    Stats [][] raceStats = new Stats[numRaces][2];
961   
962    parallelRaces = new char [numRaces][2][m_numAttribs-1];
963    char [] base = new char [m_numAttribs];
964    for (int i=0;i<m_numAttribs;i++) {
965      base[i] = '*';
966    }
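    // Schema encoding used throughout this method: '1' forces an attribute in,
    // '0' forces it out, and '*' leaves it undecided, to be filled in at random
    // for each evaluation point during the race.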
967
968    int count=0;
969    // set up initial races
970    for (int i=0;i<m_numAttribs;i++) {
971      if (i != m_classIndex) {
972        parallelRaces[count][0] = (char [])base.clone();
973        parallelRaces[count][1] = (char [])base.clone();
974        parallelRaces[count][0][i] = '1';
975        parallelRaces[count++][1][i] = '0';
976      }
977    }
978   
979    if (m_debug) {
980      System.err.println("Initial sets:\n");
981      for (int i=0;i<numRaces;i++) {
982        System.err.print(printSets(parallelRaces[i])+"--------------\n");
983      }
984    }
985   
986    BitSet randomB = new BitSet(m_numAttribs);
987    char [] randomBC = new char [m_numAttribs];
988
989    // notes which bit positions have been decided
990    boolean [] attributeConstraints = new boolean[m_numAttribs];
991    double error;
992    int evaluationCount = 0;
993    raceSet: while (numRaces > 0) {
994      boolean won = false;
995      for (int i=0;i<numRaces;i++) {
996        raceStats[i][0] = new Stats();
997        raceStats[i][1] = new Stats();
998      }
999
1000      // keep an eye on how many test instances have been randomly sampled
1001      int sampleCount = 0;
1002      // run the current set of races
1003      while (!won) {
1004        // generate a random binary string
1005        for (int i=0;i<m_numAttribs;i++) {
1006          if (i != m_classIndex) {
1007            if (!attributeConstraints[i]) {
1008              if (r.nextDouble() < 0.5) {
1009                randomB.set(i);
1010              } else {
1011                randomB.clear(i);
1012              }
1013            } else { // this position has been decided from previous races
1014              if (base[i] == '1') { 
1015                randomB.set(i);
1016              } else {
1017                randomB.clear(i);
1018              }
1019            }
1020          }
1021        }
1022       
1023        // randomly select an instance to test on
1024        int testIndex = Math.abs(r.nextInt() % numInstances);
1025
1026
1027        // We want to randomize the data the same way for every
1028        // learning scheme.
1029        trainCV = data.trainCV(numInstances, testIndex, new Random (1));
1030        testCV = data.testCV(numInstances, testIndex);
1031        testInstance = testCV.instance(0);
1032        sampleCount++;
1033        /*      if (sampleCount > numInstances) {
1034          throw new Exception("raceSchemata: No clear winner after sampling "
1035                              +sampleCount+" instances.");
1036                              } */
1037       
1038        m_theEvaluator.buildEvaluator(trainCV);
1039       
1040        // the evaluator must retrain for every test point
1041        error = -((HoldOutSubsetEvaluator)m_theEvaluator).
1042          evaluateSubset(randomB, 
1043                         testInstance,
1044                         true);
1045        evaluationCount++;
1046       
1047        // see which racers match this random subset
1048        for (int i=0;i<m_numAttribs;i++) {
1049          if (randomB.get(i)) {
1050            randomBC[i] = '1';
1051          } else {
1052            randomBC[i] = '0';
1053          }
1054        }
1055        //      System.err.println("Random subset: "+(new String(randomBC)));
1056
1057        checkRaces: for (int i=0;i<numRaces;i++) {
1058          // if a pair of racers has evaluated more than num instances
1059          // then bail out---unlikely that having any more atts is any
1060          // better than the current base set.
1061          if (((raceStats[i][0].count + raceStats[i][1].count) / 2) > 
1062              (numInstances)) {
1063            break raceSet;
1064          }
1065          for (int j=0;j<2;j++) {
1066            boolean matched = true;
1067            for (int k =0;k<m_numAttribs;k++) {
1068              if (parallelRaces[i][j][k] != '*') {
1069                if (parallelRaces[i][j][k] != randomBC[k]) {
1070                  matched = false;
1071                  break;
1072                }
1073              }
1074            }
1075            if (matched) { // update the stats for this racer
1076              //              System.err.println("Matched "+i+" "+j);
1077              raceStats[i][j].add(error);
1078
1079                // does this race have a clear winner, meaning we can
1080                // terminate the whole set of parallel races?
1081                if (raceStats[i][0].count > m_samples &&
1082                    raceStats[i][1].count > m_samples) {
1083                  raceStats[i][0].calculateDerived();
1084                  raceStats[i][1].calculateDerived();
1085                  //              System.err.println(j+" : "+(new String(parallelRaces[i][j])));
1086                  //              System.err.println(raceStats[i][0]);
1087                  //              System.err.println(raceStats[i][1]);
1088                  // check the ttest
1089                  double prob = ttest(raceStats[i][0], raceStats[i][1]);
1090                  //              System.err.println("Prob :"+prob);
1091                  if (prob < m_sigLevel) { // stop the races we have a winner!
1092                    if (raceStats[i][0].mean < raceStats[i][1].mean) {
1093                      base = (char [])parallelRaces[i][0].clone();
1094                      m_bestMerit = raceStats[i][0].mean;
1095                      if (m_debug) {
1096                        System.err.println("contender 0 won ");
1097                      }
1098                    } else {
1099                      base = (char [])parallelRaces[i][1].clone();
1100                      m_bestMerit = raceStats[i][1].mean;
1101                      if (m_debug) {
1102                        System.err.println("contender 1 won");
1103                      }
1104                    }
1105                    if (m_debug) {
1106                      System.err.println((new String(parallelRaces[i][0]))
1107                                 +" "+(new String(parallelRaces[i][1])));
1108                      System.err.println("Means : "+raceStats[i][0].mean
1109                                         +" vs"+raceStats[i][1].mean);
1110                      System.err.println("Evaluations so far : "
1111                                         +evaluationCount);
1112                    }
1113                    won = true;
1114                    break checkRaces;
1115                  }
1116                }
1117             
1118            }
1119          }
1120        }
1121      }
1122
1123      numRaces--;
1124      // set up the next set of races if necessary
1125      if (numRaces > 0 && won) {
1126        parallelRaces = new char [numRaces][2][m_numAttribs-1];
1127        raceStats = new Stats[numRaces][2];
1128        // update the attribute constraints
1129        for (int i=0;i<m_numAttribs;i++) {
1130          if (i != m_classIndex && !attributeConstraints[i] &&
1131              base[i] != '*') {
1132            attributeConstraints[i] = true;
1133            break;
1134          }
1135        }
1136        count=0;
1137        for (int i=0;i<numRaces;i++) {
1138          parallelRaces[i][0] = (char [])base.clone();
1139          parallelRaces[i][1] = (char [])base.clone();
1140          for (int j=count;j<m_numAttribs;j++) {
1141            if (j != m_classIndex && parallelRaces[i][0][j] == '*') {
1142              parallelRaces[i][0][j] = '1';
1143              parallelRaces[i][1][j] = '0';
1144              count = j+1;
1145              break;
1146            }
1147          }
1148        }
1149       
1150        if (m_debug) {
1151          System.err.println("Next sets:\n");
1152          for (int i=0;i<numRaces;i++) {
1153            System.err.print(printSets(parallelRaces[i])+"--------------\n");
1154          }
1155        }
1156      }
1157    }
1158
1159    if (m_debug) {
1160      System.err.println("Total evaluations : "
1161                         +evaluationCount);
1162    }
1163    return attributeList(base);
1164  }
1165
1166  /**
1167   * t-test for unequal sample sizes and same variance. Returns probability
1168   * that observed difference in means is due to chance.
1169   */
1170  private double ttest(Stats c1, Stats c2) throws Exception {
1171    double n1 = c1.count; double n2 = c2.count;
1172    double v1 = c1.stdDev * c1.stdDev;
1173    double v2 = c2.stdDev * c2.stdDev;
1174    double av1 = c1.mean;
1175    double av2 = c2.mean;
1176   
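    // Pooled-variance two-sample t-test: df = n1 + n2 - 2, cv is the pooled
    // variance (each sample variance weighted by its degrees of freedom), and
    // the two-tailed p-value of the t statistic is obtained from the
    // regularized incomplete beta function I_{df/(df + t^2)}(df/2, 1/2).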
1177    double df = n1 + n2 - 2;
1178    double cv = (((n1 - 1) * v1) + ((n2 - 1) * v2)) /df;
1179    double t = (av1 - av2) / Math.sqrt(cv * ((1.0 / n1) + (1.0 / n2)));
1180   
1181    return Statistics.incompleteBeta(df / 2.0, 0.5,
1182                                     df / (df + (t * t)));
1183  }
1184   
1185  /**
1186   * Performs a rank race---race consisting of no attributes, the top
1187   * ranked attribute, the top two attributes etc. The initial ranking
1188   * is determined by an attribute evaluator.
1189   * @param data the instances to estimate accuracy over
1190   * @param random a random number generator
1191   * @return an array of selected attribute indices.
1192   */
1193  private int [] rankRace(Instances data, Random random) throws Exception {
1194    char [] baseSet = new char [m_numAttribs];
1195    char [] bestSet;
1196    double bestSetError;
1197    for (int i=0;i<m_numAttribs;i++) {
1198      if (i == m_classIndex) {
1199        baseSet[i] = '-';
1200      } else {
1201        baseSet[i] = '0';
1202      }
1203    }
1204
1205    int numCompetitors = m_numAttribs-1;
1206    char [][] raceSets = new char [numCompetitors+1][m_numAttribs];
1207   
1208    if (m_ASEval instanceof AttributeEvaluator) {
1209      // generate the attribute ranking first
1210      Ranker ranker = new Ranker();
1211      m_ASEval.buildEvaluator(data);
1212      m_Ranking = ranker.search(m_ASEval,data);
1213    } else {
1214      GreedyStepwise fs = new GreedyStepwise();
1215      double [][]rankres; 
1216      fs.setGenerateRanking(true);
1217      ((ASEvaluation)m_ASEval).buildEvaluator(data);
1218      fs.search(m_ASEval, data);
1219      rankres = fs.rankedAttributes();
1220      m_Ranking = new int[rankres.length];
1221      for (int i=0;i<rankres.length;i++) {
1222        m_Ranking[i] = (int)rankres[i][0];
1223      }
1224    }
1225
1226    // set up the race
1227    raceSets[0] = (char [])baseSet.clone();
1228    for (int i=0;i<m_Ranking.length;i++) {
1229      raceSets[i+1] = (char [])raceSets[i].clone();
1230      raceSets[i+1][m_Ranking[i]] = '1';
1231    }
1232   
1233    if (m_debug) {
1234      System.err.println("Initial sets:\n"+printSets(raceSets));
1235    }
1236   
1237    // run the race
1238    double [] winnerInfo = raceSubsets(raceSets, data, true, random);
1239    bestSetError = winnerInfo[1];
1240    bestSet = (char [])raceSets[(int)winnerInfo[0]].clone();
1241    m_bestMerit = bestSetError;
1242    return attributeList(bestSet);
1243  }
1244 
1245  /**
1246   * Performs a hill climbing race---all single attribute changes to a
1247   * base subset are raced in parallel. The winner is chosen and becomes
1248   * the new base subset and the process is repeated until there is no
1249   * improvement in error over the base subset.
1250   * @param data the instances to estimate accuracy over
1251   * @param random a random number generator
1252   * @return an array of selected attribute indices.
1253   * @throws Exception if something goes wrong
1254   */
1255  private int [] hillclimbRace(Instances data, Random random) throws Exception {
1256    double baseSetError;
1257    char [] baseSet = new char [m_numAttribs];
1258
1259    for (int i=0;i<m_numAttribs;i++) {
1260      if (i != m_classIndex) {
1261        if (m_raceType == FORWARD_RACE) {
1262          baseSet[i] = '0';
1263        } else {
1264          baseSet[i] = '1';
1265        } 
1266      } else {
1267        baseSet[i] = '-';
1268      }
1269    }
1270
1271    int numCompetitors = m_numAttribs-1;
1272    char [][] raceSets = new char [numCompetitors+1][m_numAttribs];
1273
1274    raceSets[0] = (char [])baseSet.clone();
1275    int count = 1;
1276    // initialize each race set to 1 attribute
1277    for (int i=0;i<m_numAttribs;i++) {
1278      if (i != m_classIndex) {
1279        raceSets[count] = (char [])baseSet.clone();
1280        if (m_raceType == BACKWARD_RACE) {
1281          raceSets[count++][i] = '0';
1282        } else {
1283          raceSets[count++][i] = '1';
1284        }
1285      }
1286    }
1287
1288    if (m_debug) {
1289      System.err.println("Initial sets:\n"+printSets(raceSets));
1290    }
1291   
1292    // race the initial sets (base set either no or all features)
1293    double [] winnerInfo = raceSubsets(raceSets, data, true, random);
1294    baseSetError = winnerInfo[1];
1295    m_bestMerit = baseSetError;
1296    baseSet = (char [])raceSets[(int)winnerInfo[0]].clone();
1297    if (m_rankingRequested) {
1298      m_rankedAtts[m_rankedSoFar][0] = (int)(winnerInfo[0]-1);
1299      m_rankedAtts[m_rankedSoFar][1] = winnerInfo[1];
1300      m_rankedSoFar++;
1301    }
1302
1303    boolean improved = true;
1304    int j;
1305    // now race until there is no improvement over the base set or only
1306    // one competitor remains
1307    while (improved) {
1308      // generate the next set of competitors
1309      numCompetitors--;
1310      if (numCompetitors == 0) { //race finished!
1311        break;
1312      }
1313      j=0;
1314      // +1. we'll race against the base set---might be able to bail out
1315      // of the race if none from the new set are statistically better
1316      // than the base set. Base set is stored in loc 0.
1317      raceSets = new char [numCompetitors+1][m_numAttribs];
1318      for (int i=0;i<numCompetitors+1;i++) {
1319        raceSets[i] = (char [])baseSet.clone();
1320        if (i > 0) {
1321          for (int k=j;k<m_numAttribs;k++) {
1322            if (m_raceType == 1) {
1323              if (k != m_classIndex && raceSets[i][k] != '0') {
1324                raceSets[i][k] = '0';
1325                j = k+1;
1326                break;
1327              }
1328            } else {
1329              if (k != m_classIndex && raceSets[i][k] != '1') {
1330                raceSets[i][k] = '1';
1331                j = k+1;
1332                break;
1333              }
1334            }
1335          }
1336        }
1337      }
1338     
1339      if (m_debug) {
1340        System.err.println("Next set : \n"+printSets(raceSets));
1341      }
1342      improved = false;
1343      winnerInfo = raceSubsets(raceSets, data, true, random);
1344      String bs = new String(baseSet); 
1345      String win = new String(raceSets[(int)winnerInfo[0]]);
1346      if (bs.compareTo(win) == 0) {
1347        // race finished
1348      } else {
1349        if (winnerInfo[1] < baseSetError || m_rankingRequested) {
1350          improved = true;
1351          baseSetError = winnerInfo[1];
1352          m_bestMerit = baseSetError;
1353          // find which att is different
1354          if (m_rankingRequested) {
1355            for (int i = 0; i < baseSet.length; i++) {
1356              if (win.charAt(i) != bs.charAt(i)) {
1357                m_rankedAtts[m_rankedSoFar][0] = i;
1358                m_rankedAtts[m_rankedSoFar][1] = winnerInfo[1];
1359                m_rankedSoFar++;
1360              }
1361            }
1362          }
1363          baseSet = (char [])raceSets[(int)winnerInfo[0]].clone();
1364        } else {
1365          // Will get here for a subset whose error is outside the delta
1366          // threshold but is not *significantly* worse than the base
1367          // subset
1368          //throw new Exception("RaceSearch: problem in hillClimbRace");
1369        }
1370      }
1371    }
1372    return attributeList(baseSet);
1373  }
1374
1375  /**
1376   * Convert an attribute set to an array of indices
1377   */
1378  private int [] attributeList(char [] list) {
1379    int count = 0;
1380
1381    for (int i=0;i<m_numAttribs;i++) {
1382      if (list[i] == '1') {
1383        count++;
1384      }
1385    }
1386
1387    int [] rlist = new int[count];
1388    count = 0;
1389     for (int i=0;i<m_numAttribs;i++) {
1390       if (list[i] == '1') {
1391         rlist[count++] = i;
1392       }
1393     }
1394
1395     return rlist;
1396  }
1397
1398  /**
1399   * Races the leave-one-out cross validation errors of a set of
1400   * attribute subsets on a set of instances.
1401   * @param raceSets a set of attribute subset specifications
1402   * @param data the instances to use when cross validating
1403   * @param baseSetIncluded true if the first attribute set is a
1404   * base set generated from the previous race
1405   * @param random a random number generator
 1406   * @return a two-element array containing the index of the winning subset and its error
1407   * @throws Exception if an error occurs during cross validation
1408   */
1409  private double [] raceSubsets(char [][]raceSets, Instances data,
1410                                boolean baseSetIncluded, Random random) 
1411    throws Exception {
1412    // the evaluators --- one for each subset
1413    ASEvaluation [] evaluators = 
1414      ASEvaluation.makeCopies(m_theEvaluator, raceSets.length);
1415
1416    // array of subsets eliminated from the race
1417    boolean [] eliminated = new boolean [raceSets.length];
1418
1419    // individual statistics
1420    Stats [] individualStats = new Stats [raceSets.length];
1421
1422    // pairwise statistics
1423    PairedStats [][] testers = 
1424      new PairedStats[raceSets.length][raceSets.length];
1425
1426    /** do we ignore the base set or not? */
1427    int startPt = m_rankingRequested ? 1 : 0;
1428
1429    for (int i=0;i<raceSets.length;i++) {
1430      individualStats[i] = new Stats();
1431      for (int j=i+1;j<raceSets.length;j++) {
1432        testers[i][j] = new PairedStats(m_sigLevel);
1433      }
1434    }
1435   
1436    BitSet [] raceBitSets = new BitSet[raceSets.length];
1437    for (int i=0;i<raceSets.length;i++) {
1438      raceBitSets[i] = new BitSet(m_numAttribs);
1439      for (int j=0;j<m_numAttribs;j++) {
1440        if (raceSets[i][j] == '1') {
1441          raceBitSets[i].set(j);
1442        }
1443      }
1444    }
1445
1446    // now loop over the data points collecting leave-one-out errors for
1447    // each attribute set
1448    Instances trainCV;
1449    Instances testCV;
1450    Instance testInst;
1451    double [] errors = new double [raceSets.length];
1452    int eliminatedCount = 0;
1453    int processedCount = 0;
1454    // if there is one set left in the race then we need to continue to
1455    // evaluate it for the remaining instances in order to get an
1456    // accurate error estimate
1457    processedCount = 0;
1458    race: for (int i=0;i<m_numFolds;i++) {
1459
1460      // We want to randomize the data the same way for every
1461      // learning scheme.
1462      trainCV = data.trainCV(m_numFolds, i, new Random (1));
1463      testCV = data.testCV(m_numFolds, i);
1464     
1465      // loop over the surviving attribute sets building classifiers for this
1466      // training set
1467      for (int j=startPt;j<raceSets.length;j++) {
1468        if (!eliminated[j]) {
1469          evaluators[j].buildEvaluator(trainCV);
1470        }
1471      }
1472
1473      for (int z=0;z<testCV.numInstances();z++) {
1474        testInst = testCV.instance(z);
1475        processedCount++;
1476
1477        // loop over surviving attribute sets computing errors for this
1478        // test point
1479        for (int zz=startPt;zz<raceSets.length;zz++) {
1480          if (!eliminated[zz]) {
1481            if (z == 0) {// first test instance---make sure classifier is built
1482              errors[zz] = -((HoldOutSubsetEvaluator)evaluators[zz]).
1483                evaluateSubset(raceBitSets[zz], 
1484                               testInst,
1485                               true);
1486            } else { // must be k fold rather than leave one out
1487              errors[zz] = -((HoldOutSubsetEvaluator)evaluators[zz]).
1488                evaluateSubset(raceBitSets[zz], 
1489                               testInst,
1490                               false);
1491            }
1492          }
1493        }
1494
1495        // now update the stats
1496        for (int j=startPt;j<raceSets.length;j++) {
1497          if (!eliminated[j]) {
1498            individualStats[j].add(errors[j]);
1499            for (int k=j+1;k<raceSets.length;k++) {
1500              if (!eliminated[k]) {
1501                testers[j][k].add(errors[j], errors[k]);
1502              }
1503            }
1504          }
1505        }
1506     
1507        // test for near identical models and models that are significantly
1508        // worse than some other model
1509        if (processedCount > m_samples-1 && 
1510            (eliminatedCount < raceSets.length-1)) {
1511          for (int j=0;j<raceSets.length;j++) {
1512            if (!eliminated[j]) {
1513              for (int k=j+1;k<raceSets.length;k++) {
1514                if (!eliminated[k]) {
1515                  testers[j][k].calculateDerived();
1516                  // near identical ?
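                  // (no significant difference and the mean difference is zero
                  // or smaller in magnitude than m_delta)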
1517                  if ((testers[j][k].differencesSignificance == 0) && 
1518                      (Utils.eq(testers[j][k].differencesStats.mean, 0.0) ||
1519                      (Utils.gr(m_delta, Math.abs(testers[j][k].
1520                                                  differencesStats.mean))))) {
1521                    // if they're exactly the same and there is a base set
1522                    // in this race, make sure that the base set is NOT the
1523                    // one eliminated.
1524                    if (Utils.eq(testers[j][k].differencesStats.mean, 0.0)) {
1525
1526                      if (baseSetIncluded) { 
1527                        if (j != 0) {
1528                          eliminated[j] = true;
1529                        } else {
1530                          eliminated[k] = true;
1531                        }
1532                        eliminatedCount++;
1533                      } else {
1534                        eliminated[j] = true; eliminatedCount++; // count this elimination as well
1535                      }
1536                      if (m_debug) {
1537                        System.err.println("Eliminating (identical) "
1538                                           +j+" "+raceBitSets[j].toString()
1539                                           +" vs "+k+" "
1540                                           +raceBitSets[k].toString()
1541                                           +" after "
1542                                           +processedCount
1543                                           +" evaluations\n"
1544                                           +"\nerror "+j+" : "
1545                                           +testers[j][k].xStats.mean
1546                                           +" vs "+k+" : "
1547                                           +testers[j][k].yStats.mean
1548                                           +" diff : "
1549                                           +testers[j][k].differencesStats
1550                                           .mean);
1551                      }
1552                    } else {
1553                      // eliminate the one with the higher error
1554                      if (testers[j][k].xStats.mean > 
1555                          testers[j][k].yStats.mean) {
1556                        eliminated[j] = true;
1557                        eliminatedCount++;
1558                        if (m_debug) {
1559                          System.err.println("Eliminating (near identical) "
1560                                           +j+" "+raceBitSets[j].toString()
1561                                           +" vs "+k+" "
1562                                           +raceBitSets[k].toString()
1563                                           +" after "
1564                                           +processedCount
1565                                           +" evaluations\n"
1566                                           +"\nerror "+j+" : "
1567                                           +testers[j][k].xStats.mean
1568                                           +" vs "+k+" : "
1569                                           +testers[j][k].yStats.mean
1570                                           +" diff : "
1571                                           +testers[j][k].differencesStats
1572                                           .mean);
1573                        }
1574                        break;
1575                      } else {
1576                        eliminated[k] = true;
1577                        eliminatedCount++;
1578                        if (m_debug) {
1579                          System.err.println("Eliminating (near identical) "
1580                                           +k+" "+raceBitSets[k].toString()
1581                                           +" vs "+j+" "
1582                                           +raceBitSets[j].toString()
1583                                           +" after "
1584                                           +processedCount
1585                                           +" evaluations\n"
1586                                           +"\nerror "+k+" : "
1587                                           +testers[j][k].yStats.mean
1588                                           +" vs "+j+" : "
1589                                           +testers[j][k].xStats.mean
1590                                           +" diff : "
1591                                           +testers[j][k].differencesStats
1592                                             .mean);
1593                        }
1594                      }
1595                    }
1596                  } else {
1597                    // significantly worse ?
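                    // (a positive sign means subset j has the significantly
                    // larger error, a negative sign means subset k does)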
1598                    if (testers[j][k].differencesSignificance != 0) {
1599                      if (testers[j][k].differencesSignificance > 0) {
1600                        eliminated[j] = true;
1601                        eliminatedCount++;
1602                        if (m_debug) {
1603                          System.err.println("Eliminating (-worse) "
1604                                           +j+" "+raceBitSets[j].toString()
1605                                           +" vs "+k+" "
1606                                           +raceBitSets[k].toString()
1607                                           +" after "
1608                                           +processedCount
1609                                           +" evaluations"
1610                                           +"\nerror "+j+" : "
1611                                           +testers[j][k].xStats.mean
1612                                           +" vs "+k+" : "
1613                                           +testers[j][k].yStats.mean);
1614                        }
1615                        break;
1616                      } else {
1617                        eliminated[k] = true;
1618                        eliminatedCount++;
1619                        if (m_debug) {
1620                          System.err.println("Eliminating (worse) "
1621                                           +k+" "+raceBitSets[k].toString()
1622                                           +" vs "+j+" "
1623                                           +raceBitSets[j].toString()
1624                                           +" after "
1625                                           +processedCount
1626                                           +" evaluations"
1627                                           +"\nerror "+k+" : "
1628                                           +testers[j][k].yStats.mean
1629                                           +" vs "+j+" : "
1630                                           +testers[j][k].xStats.mean);
1631                        }
1632                      }
1633                    }
1634                  }
1635                }   
1636              }
1637            }
1638          }
1639        }
1640        // if there is a base set from the previous race and it's the
1641        // only remaining subset then terminate the race.
1642        if (eliminatedCount == raceSets.length-1 && baseSetIncluded &&
1643            !eliminated[0] && !m_rankingRequested) {
1644          break race;
1645        }
1646      }
1647    }
1648
1649    if (m_debug) {
1650      System.err.println("*****eliminated count: "+eliminatedCount);
1651    }
1652    double bestError = Double.MAX_VALUE;
1653    int bestIndex=0;
1654    // return the index of the winner
1655    for (int i=startPt;i<raceSets.length;i++) {
1656      if (!eliminated[i]) {
1657        individualStats[i].calculateDerived();
1658        if (m_debug) {
1659          System.err.println("Remaining error: "+raceBitSets[i].toString()
1660                             +" "+individualStats[i].mean);
1661        }
1662        if (individualStats[i].mean < bestError) {
1663          bestError = individualStats[i].mean;
1664          bestIndex = i;
1665        }
1666      }
1667    }
1668
1669    double [] retInfo = new double[2];
1670    retInfo[0] = bestIndex;
1671    retInfo[1] = bestError;
1672   
1673    if (m_debug) {
1674      System.err.print("Best set from race : ");
1675     
1676      for (int i=0;i<m_numAttribs;i++) {
1677        if (raceSets[bestIndex][i] == '1') {
1678          System.err.print('1');
1679        } else {
1680          System.err.print('0');
1681        }
1682      }
1683      System.err.println(" :"+bestError+" Processed : "+(processedCount)
1684                         +"\n"+individualStats[bestIndex].toString());
1685    }
1686    return retInfo;
1687  }
1688
1689  /**
1690   * Returns a string representation
1691   *
1692   * @return a string representation
1693   */
1694  public String toString() {
1695    StringBuffer text = new StringBuffer();
1696   
1697    text.append("\tRaceSearch.\n\tRace type : ");
1698    switch (m_raceType) {
1699    case FORWARD_RACE: 
1700      text.append("forward selection race\n\tBase set : no attributes");
1701      break;
1702    case BACKWARD_RACE:
1703      text.append("backward elimination race\n\tBase set : all attributes");
1704      break;
1705    case SCHEMATA_RACE:
1706      text.append("schemata race\n\tBase set : no attributes");
1707      break;
1708    case RANK_RACE:
1709      text.append("rank race\n\tBase set : no attributes\n\t");
1710      text.append("Attribute evaluator : "
1711                  + getAttributeEvaluator().getClass().getName() +" ");
1712      if (m_ASEval instanceof OptionHandler) {
1713        String[] evaluatorOptions = new String[0];
1714        evaluatorOptions = ((OptionHandler)m_ASEval).getOptions();
1715        for (int i=0;i<evaluatorOptions.length;i++) {
1716          text.append(evaluatorOptions[i]+' ');
1717        }
1718      }
1719      text.append("\n");
1720      text.append("\tAttribute ranking : \n");
1721      int rlength = (int)(Math.log(m_Ranking.length) / Math.log(10) + 1);
1722      for (int i=0;i<m_Ranking.length;i++) {
1723        text.append("\t "+Utils.doubleToString((double)(m_Ranking[i]+1),
1724                                               rlength,0)
1725                    +" "+m_Instances.attribute(m_Ranking[i]).name()+'\n');
1726      }
1727      break;
1728    }
1729    text.append("\n\tCross validation mode : ");
1730    if (m_xvalType == TEN_FOLD) {
1731      text.append("10 fold");
1732    } else {
1733      text.append("Leave-one-out");
1734    }
1735
1736    text.append("\n\tMerit of best subset found : ");
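    // the display field width and number of decimal places are derived from
    // the order of magnitude (log10) of the merit and of its fractional part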
1737    int fieldwidth = 3;
1738    double precision = (m_bestMerit - (int)m_bestMerit);
1739    if (Math.abs(m_bestMerit) > 0) {
1740      fieldwidth = (int)Math.abs((Math.log(Math.abs(m_bestMerit)) / 
1741                                  Math.log(10)))+2;
1742    }
1743    if (Math.abs(precision) > 0) {
1744      precision = Math.abs((Math.log(Math.abs(precision)) / Math.log(10)))+3;
1745    } else {
1746      precision = 2;
1747    }
1748
1749    text.append(Utils.doubleToString(Math.abs(m_bestMerit),
1750                                     fieldwidth+(int)precision,
1751                                     (int)precision)+"\n");
1752    return text.toString();
1753   
1754  }
1755
1756  /**
1757   * Reset the search method.
1758   */
1759  protected void resetOptions () {
1760    m_sigLevel = 0.001;
1761    m_delta = 0.001;
1762    m_ASEval = new GainRatioAttributeEval();
1763    m_Ranking = null;
1764    m_raceType = FORWARD_RACE;
1765    m_debug = false;
1766    m_theEvaluator = null;
1767    m_bestMerit = -Double.MAX_VALUE;
1768    m_numFolds = 10;
1769  }
1770 
1771  /**
1772   * Returns the revision string.
1773   *
1774   * @return            the revision
1775   */
1776  public String getRevision() {
1777    return RevisionUtils.extract("$Revision: 1.26 $");
1778  }
1779}
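// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original source). RaceSearch is
// normally driven through weka.attributeSelection.AttributeSelection together
// with a hold-out subset evaluator such as ClassifierSubsetEval; with default
// options, something along these lines should work:
//
//   AttributeSelection selector = new AttributeSelection();
//   selector.setEvaluator(new ClassifierSubsetEval());
//   selector.setSearch(new RaceSearch());
//   selector.SelectAttributes(trainingData);  // trainingData: weka.core.Instances
//   int[] chosen = selector.selectedAttributes();
// ---------------------------------------------------------------------------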