source: src/main/java/weka/classifiers/BVDecomposeSegCVSub.java @ 9

Last change on this file since 9 was 4, checked in by gnappo, 14 years ago

Import of weka.

File size: 37.2 KB
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    BVDecomposeSegCVSub.java
 *    Copyright (C) 2003 Paul Conilione
 *
 *    Based on the class: BVDecompose.java by Len Trigg (1999)
 */


/*
 *    DEDICATION
 *
 *    Paul Conilione would like to express his deep gratitude and appreciation
 *    to his Chinese Buddhist Taoist Master Sifu Chow Yuk Nen for the abilities
 *    and insight that he has been taught, which have allowed him to program in
 *    a clear and efficient manner.
 *
 *    Master Sifu Chow Yuk Nen's Teachings are unique and precious. They are
 *    applicable to any field of human endeavour. Through his unique and powerful
 *    ability to skilfully apply Chinese Buddhist Teachings, people have achieved
 *    success in: computing, chemical engineering, business, accounting, philosophy
 *    and more.
 *
 */

package weka.classifiers;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * This class performs Bias-Variance decomposition on any classifier using the sub-sampled cross-validation procedure as specified in (1).<br/>
 * The Kohavi and Wolpert definition of bias and variance is specified in (2).<br/>
 * The Webb definition of bias and variance is specified in (3).<br/>
 * <br/>
 * Geoffrey I. Webb, Paul Conilione (2002). Estimating bias and variance from data. School of Computer Science and Software Engineering, Victoria, Australia.<br/>
 * <br/>
 * Ron Kohavi, David H. Wolpert: Bias Plus Variance Decomposition for Zero-One Loss Functions. In: Machine Learning: Proceedings of the Thirteenth International Conference, 275-283, 1996.<br/>
 * <br/>
 * Geoffrey I. Webb (2000). MultiBoosting: A Technique for Combining Boosting and Wagging. Machine Learning. 40(2):159-196.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;misc{Webb2002,
 *    address = {School of Computer Science and Software Engineering, Victoria, Australia},
 *    author = {Geoffrey I. Webb and Paul Conilione},
 *    institution = {Monash University},
 *    title = {Estimating bias and variance from data},
 *    year = {2002},
 *    PDF = {http://www.csse.monash.edu.au/\~webb/Files/WebbConilione04.pdf}
 * }
 *
 * &#64;inproceedings{Kohavi1996,
 *    author = {Ron Kohavi and David H. Wolpert},
 *    booktitle = {Machine Learning: Proceedings of the Thirteenth International Conference},
 *    editor = {Lorenza Saitta},
 *    pages = {275-283},
 *    publisher = {Morgan Kaufmann},
 *    title = {Bias Plus Variance Decomposition for Zero-One Loss Functions},
 *    year = {1996},
 *    PS = {http://robotics.stanford.edu/\~ronnyk/biasVar.ps}
 * }
 *
 * &#64;article{Webb2000,
 *    author = {Geoffrey I. Webb},
 *    journal = {Machine Learning},
 *    number = {2},
 *    pages = {159-196},
 *    title = {MultiBoosting: A Technique for Combining Boosting and Wagging},
 *    volume = {40},
 *    year = {2000}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -c &lt;class index&gt;
 *  The index of the class attribute.
 *  (default last)</pre>
 *
 * <pre> -D
 *  Turn on debugging output.</pre>
 *
 * <pre> -l &lt;num&gt;
 *  The number of times each instance is classified.
 *  (default 10)</pre>
 *
 * <pre> -p &lt;proportion of objects in common&gt;
 *  The average proportion of instances common between any two training sets</pre>
 *
 * <pre> -s &lt;seed&gt;
 *  The random number seed used.</pre>
 *
 * <pre> -t &lt;name of arff file&gt;
 *  The name of the arff file used for the decomposition.</pre>
 *
 * <pre> -T &lt;number of instances in training set&gt;
 *  The number of instances in the training set.</pre>
 *
 * <pre> -W &lt;classifier class name&gt;
 *  Full class name of the learner used in the decomposition.
 *  eg: weka.classifiers.bayes.NaiveBayes</pre>
 *
 * <pre>
 * Options specific to learner weka.classifiers.rules.ZeroR:
 * </pre>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 <!-- options-end -->
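 *
 * A minimal programmatic sketch (the classifier and the dataset file name here
 * are illustrative, not prescribed by this class):
 * <pre>
 * BVDecomposeSegCVSub bvd = new BVDecomposeSegCVSub();
 * bvd.setClassifier(new weka.classifiers.bayes.NaiveBayes());
 * bvd.setDataFileName("iris.arff");   // hypothetical ARFF file
 * bvd.decompose();
 * System.out.println(bvd.toString());
 * </pre>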
 *
 * Options after -- are passed to the designated sub-learner. <p>
 *
 * @author Paul Conilione (paulc4321@yahoo.com.au)
 * @version $Revision: 6041 $
 */
public class BVDecomposeSegCVSub
    implements OptionHandler, TechnicalInformationHandler, RevisionHandler {

    /** Debugging mode, gives extra output if true. */
    protected boolean m_Debug;

    /** An instantiated base classifier used for getting and testing options. */
    protected Classifier m_Classifier = new weka.classifiers.rules.ZeroR();

    /** The options to be passed to the base classifier. */
    protected String [] m_ClassifierOptions;

    /** The number of times an instance is classified. */
    protected int m_ClassifyIterations;

    /** The name of the data file used for the decomposition. */
    protected String m_DataFileName;

    /** The index of the class attribute. */
    protected int m_ClassIndex = -1;

    /** The random number seed. */
    protected int m_Seed = 1;

    /** The calculated Kohavi & Wolpert bias (squared). */
    protected double m_KWBias;

    /** The calculated Kohavi & Wolpert variance. */
    protected double m_KWVariance;

    /** The calculated Kohavi & Wolpert sigma. */
    protected double m_KWSigma;

    /** The calculated Webb bias. */
    protected double m_WBias;

    /** The calculated Webb variance. */
    protected double m_WVariance;

    /** The error rate. */
    protected double m_Error;

    /** The training set size. */
    protected int m_TrainSize;

    /** Proportion of instances common between any two training sets. */
    protected double m_P;

    /**
     * Returns a string describing this object.
     * @return a description of the classifier suitable for
     * displaying in the explorer/experimenter gui
     */
    public String globalInfo() {
      return
          "This class performs Bias-Variance decomposition on any classifier using the "
        + "sub-sampled cross-validation procedure as specified in (1).\n"
        + "The Kohavi and Wolpert definition of bias and variance is specified in (2).\n"
        + "The Webb definition of bias and variance is specified in (3).\n\n"
        + getTechnicalInformation().toString();
    }

    /**
     * Returns an instance of a TechnicalInformation object, containing
     * detailed information about the technical background of this class,
     * e.g., paper reference or book this class is based on.
     *
     * @return the technical information about this class
     */
    public TechnicalInformation getTechnicalInformation() {
      TechnicalInformation      result;
      TechnicalInformation      additional;

      result = new TechnicalInformation(Type.MISC);
      result.setValue(Field.AUTHOR, "Geoffrey I. Webb and Paul Conilione");
      result.setValue(Field.YEAR, "2002");
      result.setValue(Field.TITLE, "Estimating bias and variance from data");
      result.setValue(Field.INSTITUTION, "Monash University");
      result.setValue(Field.ADDRESS, "School of Computer Science and Software Engineering, Victoria, Australia");
      result.setValue(Field.PDF, "http://www.csse.monash.edu.au/~webb/Files/WebbConilione04.pdf");

      additional = result.add(Type.INPROCEEDINGS);
      additional.setValue(Field.AUTHOR, "Ron Kohavi and David H. Wolpert");
      additional.setValue(Field.YEAR, "1996");
      additional.setValue(Field.TITLE, "Bias Plus Variance Decomposition for Zero-One Loss Functions");
      additional.setValue(Field.BOOKTITLE, "Machine Learning: Proceedings of the Thirteenth International Conference");
      additional.setValue(Field.PUBLISHER, "Morgan Kaufmann");
      additional.setValue(Field.EDITOR, "Lorenza Saitta");
      additional.setValue(Field.PAGES, "275-283");
      additional.setValue(Field.PS, "http://robotics.stanford.edu/~ronnyk/biasVar.ps");

      additional = result.add(Type.ARTICLE);
      additional.setValue(Field.AUTHOR, "Geoffrey I. Webb");
      additional.setValue(Field.YEAR, "2000");
      additional.setValue(Field.TITLE, "MultiBoosting: A Technique for Combining Boosting and Wagging");
      additional.setValue(Field.JOURNAL, "Machine Learning");
      additional.setValue(Field.VOLUME, "40");
      additional.setValue(Field.NUMBER, "2");
      additional.setValue(Field.PAGES, "159-196");

      return result;
    }

    /**
     * Returns an enumeration describing the available options.
     *
     * @return an enumeration of all the available options.
     */
    public Enumeration listOptions() {

        Vector newVector = new Vector(8);

        newVector.addElement(new Option(
        "\tThe index of the class attribute.\n"+
        "\t(default last)",
        "c", 1, "-c <class index>"));
        newVector.addElement(new Option(
        "\tTurn on debugging output.",
        "D", 0, "-D"));
        newVector.addElement(new Option(
        "\tThe number of times each instance is classified.\n"
        +"\t(default 10)",
        "l", 1, "-l <num>"));
        newVector.addElement(new Option(
        "\tThe average proportion of instances common between any two training sets",
        "p", 1, "-p <proportion of objects in common>"));
        newVector.addElement(new Option(
        "\tThe random number seed used.",
        "s", 1, "-s <seed>"));
        newVector.addElement(new Option(
        "\tThe name of the arff file used for the decomposition.",
        "t", 1, "-t <name of arff file>"));
        newVector.addElement(new Option(
        "\tThe number of instances in the training set.",
        "T", 1, "-T <number of instances in training set>"));
        newVector.addElement(new Option(
        "\tFull class name of the learner used in the decomposition.\n"
        +"\teg: weka.classifiers.bayes.NaiveBayes",
        "W", 1, "-W <classifier class name>"));

        if ((m_Classifier != null) &&
        (m_Classifier instanceof OptionHandler)) {
            newVector.addElement(new Option(
            "",
            "", 0, "\nOptions specific to learner "
            + m_Classifier.getClass().getName()
            + ":"));
            Enumeration enu = ((OptionHandler)m_Classifier).listOptions();
            while (enu.hasMoreElements()) {
                newVector.addElement(enu.nextElement());
            }
        }
        return newVector.elements();
    }


    /**
     * Sets the OptionHandler's options using the given list. All options
     * will be set (or reset) during this call (i.e. incremental setting
     * of options is not possible). <p/>
     *
     <!-- options-start -->
     * Valid options are: <p/>
     *
     * <pre> -c &lt;class index&gt;
     *  The index of the class attribute.
     *  (default last)</pre>
     *
     * <pre> -D
     *  Turn on debugging output.</pre>
     *
     * <pre> -l &lt;num&gt;
     *  The number of times each instance is classified.
     *  (default 10)</pre>
     *
     * <pre> -p &lt;proportion of objects in common&gt;
     *  The average proportion of instances common between any two training sets</pre>
     *
     * <pre> -s &lt;seed&gt;
     *  The random number seed used.</pre>
     *
     * <pre> -t &lt;name of arff file&gt;
     *  The name of the arff file used for the decomposition.</pre>
     *
     * <pre> -T &lt;number of instances in training set&gt;
     *  The number of instances in the training set.</pre>
     *
     * <pre> -W &lt;classifier class name&gt;
     *  Full class name of the learner used in the decomposition.
     *  eg: weka.classifiers.bayes.NaiveBayes</pre>
     *
     * <pre>
     * Options specific to learner weka.classifiers.rules.ZeroR:
     * </pre>
     *
     * <pre> -D
     *  If set, classifier is run in debug mode and
     *  may output additional info to the console</pre>
     *
     <!-- options-end -->
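     *
     * A sketch of calling this method programmatically (the file name and option
     * values are hypothetical, and <code>bvd</code> is assumed to be an instance
     * of this class):
     * <pre>
     * String [] options = {"-t", "iris.arff", "-T", "75", "-p", "0.6",
     *                      "-W", "weka.classifiers.bayes.NaiveBayes"};
     * bvd.setOptions(options);
     * </pre>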
     *
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {
        setDebug(Utils.getFlag('D', options));

        String classIndex = Utils.getOption('c', options);
        if (classIndex.length() != 0) {
            if (classIndex.toLowerCase().equals("last")) {
                setClassIndex(0);
            } else if (classIndex.toLowerCase().equals("first")) {
                setClassIndex(1);
            } else {
                setClassIndex(Integer.parseInt(classIndex));
            }
        } else {
            setClassIndex(0);
        }

        String classifyIterations = Utils.getOption('l', options);
        if (classifyIterations.length() != 0) {
            setClassifyIterations(Integer.parseInt(classifyIterations));
        } else {
            setClassifyIterations(10);
        }

        String prob = Utils.getOption('p', options);
        if (prob.length() != 0) {
            setP(Double.parseDouble(prob));
        } else {
            setP(-1);
        }
        //throw new Exception("A proportion must be specified" + " with a -p option.");

        String seedString = Utils.getOption('s', options);
        if (seedString.length() != 0) {
            setSeed(Integer.parseInt(seedString));
        } else {
            setSeed(1);
        }

        String dataFile = Utils.getOption('t', options);
        if (dataFile.length() != 0) {
            setDataFileName(dataFile);
        } else {
            throw new Exception("An arff file must be specified"
            + " with the -t option.");
        }

        String trainSize = Utils.getOption('T', options);
        if (trainSize.length() != 0) {
            setTrainSize(Integer.parseInt(trainSize));
        } else {
            setTrainSize(-1);
        }
        //throw new Exception("A training set size must be specified" + " with a -T option.");

        String classifierName = Utils.getOption('W', options);
        if (classifierName.length() != 0) {
            setClassifier(AbstractClassifier.forName(classifierName, Utils.partitionOptions(options)));
        } else {
            throw new Exception("A learner must be specified with the -W option.");
        }
    }

    /**
     * Gets the current settings of the BVDecomposeSegCVSub.
     *
     * @return an array of strings suitable for passing to setOptions
     */
    public String [] getOptions() {

        String [] classifierOptions = new String [0];
        if ((m_Classifier != null) &&
        (m_Classifier instanceof OptionHandler)) {
            classifierOptions = ((OptionHandler)m_Classifier).getOptions();
        }
        // 16 slots cover the maximum number of option strings set below:
        // -D, six option/value pairs, -W plus the class name, and the "--" separator.
        String [] options = new String [classifierOptions.length + 16];
        int current = 0;
        if (getDebug()) {
            options[current++] = "-D";
        }
        options[current++] = "-c"; options[current++] = "" + getClassIndex();
        options[current++] = "-l"; options[current++] = "" + getClassifyIterations();
        options[current++] = "-p"; options[current++] = "" + getP();
        options[current++] = "-s"; options[current++] = "" + getSeed();
        if (getDataFileName() != null) {
            options[current++] = "-t"; options[current++] = "" + getDataFileName();
        }
        options[current++] = "-T"; options[current++] = "" + getTrainSize();
        if (getClassifier() != null) {
            options[current++] = "-W";
            options[current++] = getClassifier().getClass().getName();
        }

        options[current++] = "--";
        System.arraycopy(classifierOptions, 0, options, current,
        classifierOptions.length);
        current += classifierOptions.length;
        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }

    /**
     * Set the classifier being analysed.
     *
     * @param newClassifier the Classifier to use.
     */
    public void setClassifier(Classifier newClassifier) {

        m_Classifier = newClassifier;
    }

    /**
     * Gets the classifier being analysed.
     *
     * @return the classifier being analysed.
     */
    public Classifier getClassifier() {

        return m_Classifier;
    }

    /**
     * Sets debugging mode.
     *
     * @param debug true if debug output should be printed
     */
    public void setDebug(boolean debug) {

        m_Debug = debug;
    }

    /**
     * Gets whether debugging is turned on.
     *
     * @return true if debugging output is on
     */
    public boolean getDebug() {

        return m_Debug;
    }


    /**
     * Sets the random number seed.
     *
     * @param seed the random number seed
     */
    public void setSeed(int seed) {

        m_Seed = seed;
    }

    /**
     * Gets the random number seed.
     *
     * @return the random number seed
     */
    public int getSeed() {

        return m_Seed;
    }

    /**
     * Sets the number of times an instance is classified.
     *
     * @param classifyIterations number of times an instance is classified
     */
    public void setClassifyIterations(int classifyIterations) {

        m_ClassifyIterations = classifyIterations;
    }

    /**
     * Gets the number of times an instance is classified.
     *
     * @return the maximum number of times an instance is classified
     */
    public int getClassifyIterations() {

        return m_ClassifyIterations;
    }

    /**
     * Sets the name of the dataset file.
     *
     * @param dataFileName name of dataset file.
     */
    public void setDataFileName(String dataFileName) {

        m_DataFileName = dataFileName;
    }

    /**
     * Get the name of the data file used for the decomposition.
     *
     * @return the name of the data file
     */
    public String getDataFileName() {

        return m_DataFileName;
    }

    /**
     * Get the index (starting from 1) of the attribute used as the class.
     *
     * @return the index of the class attribute
     */
    public int getClassIndex() {

        return m_ClassIndex + 1;
    }
    /**
     * Sets the index (starting from 1) of the attribute used as the class.
     *
     * @param classIndex the index (starting from 1) of the class attribute
     */
    public void setClassIndex(int classIndex) {

        m_ClassIndex = classIndex - 1;
    }

    /**
     * Get the calculated bias squared according to the Kohavi and Wolpert definition.
     *
     * @return the bias squared
     */
    public double getKWBias() {

        return m_KWBias;
    }

    /**
     * Get the calculated bias according to the Webb definition.
     *
     * @return the bias
     *
     */
    public double getWBias() {

        return m_WBias;
    }


    /**
     * Get the calculated variance according to the Kohavi and Wolpert definition.
     *
     * @return the variance
     */
    public double getKWVariance() {

        return m_KWVariance;
    }

    /**
     * Get the calculated variance according to the Webb definition.
     *
     * @return the variance according to Webb
     *
     */
    public double getWVariance() {

        return m_WVariance;
    }

    /**
     * Get the calculated sigma according to the Kohavi and Wolpert definition.
     *
     * @return the sigma
     *
     */
    public double getKWSigma() {

        return m_KWSigma;
    }

    /**
     * Set the training size.
     *
     * @param size the size of the training set
     *
     */
    public void setTrainSize(int size) {

        m_TrainSize = size;
    }

    /**
     * Get the training size.
     *
     * @return the size of the training set
     *
     */
    public int getTrainSize() {

        return m_TrainSize;
    }

    /**
     * Set the proportion of instances that are common between two training sets
     * used to train a classifier.
     *
     * @param proportion the proportion of instances that are common between training
     * sets.
     *
     */
    public void setP(double proportion) {

        m_P = proportion;
    }

    /**
     * Get the proportion of instances that are common between two training sets.
     *
     * @return the proportion
     *
     */
    public double getP() {

        return m_P;
    }

    /**
     * Get the calculated error rate.
     *
     * @return the error rate
     */
    public double getError() {

        return m_Error;
    }

    /**
     * Carry out the bias-variance decomposition using the sub-sampled cross-validation method.
     *
     * @throws Exception if the decomposition couldn't be carried out
     */
    public void decompose() throws Exception {

        Reader dataReader;
        Instances data;

        int tps; // training pool size, the size of each segment E.
        int k; // number of folds in segment E.
        int q; // number of segments of size tps.

        dataReader = new BufferedReader(new FileReader(m_DataFileName)); // open the ARFF file
        data = new Instances(dataReader); // read the dataset into an Instances object

        if (m_ClassIndex < 0) {
            data.setClassIndex(data.numAttributes() - 1);
        } else {
            data.setClassIndex(m_ClassIndex);
        }

        if (data.classAttribute().type() != Attribute.NOMINAL) {
            throw new Exception("Class attribute must be nominal");
        }
        int numClasses = data.numClasses();

        data.deleteWithMissingClass();
        if ( data.checkForStringAttributes() ) {
            throw new Exception("Can't handle string attributes!");
        }

        // Dataset size must be greater than 2
        if ( data.numInstances() <= 2 ){
            throw new Exception("Dataset size must be greater than 2.");
        }

        if ( m_TrainSize == -1 ){ // default value
            m_TrainSize = (int) Math.floor( (double) data.numInstances() / 2.0 );
        } else if ( m_TrainSize < 0 || m_TrainSize >= data.numInstances() - 1 ) { // check that 0 < training size < |D| - 1
            throw new Exception("Training set size of "+m_TrainSize+" is invalid.");
        }

        if ( m_P == -1 ){ // default value
            m_P = (double) m_TrainSize / ( (double)data.numInstances() - 1 );
        } else if (  m_P < ( m_TrainSize / ( (double)data.numInstances() - 1 ) ) || m_P >= 1.0  ) { // check that p is in range: m/(|D|-1) <= p < 1.0
            throw new Exception("Proportion is not in range: "+ (m_TrainSize / ((double) data.numInstances() - 1 )) +" <= p < 1.0 ");
        }

        // round tps up from double to integer
        tps = (int) Math.ceil( ((double)m_TrainSize / (double)m_P) + 1 );
        k = (int) Math.ceil( tps / (tps - (double) m_TrainSize));
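
        // Rationale (following the sub-sampled CV construction above): two training
        // sets of size m drawn from a common pool of size tps share on average
        // p = m/(tps - 1) of their instances, which rearranges to tps = m/p + 1
        // (rounded up); and each training partition (the pool minus one of k folds)
        // must hold at least m instances, giving k = tps/(tps - m) (rounded up).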

        // the number of folds cannot exceed the number of instances in the training pool
        if ( k > tps ) {
            throw new Exception("The required number of folds is too many."
            + " Change p or the size of the training set.");
        }

        // calculate the number of segments, rounding down.
        q = (int) Math.floor( (double) data.numInstances() / (double)tps );

        // create the prediction-count matrix: one row per instance in the data set
        // (all instances are used), one column per class.
        double [][] instanceProbs = new double [data.numInstances()][numClasses];
        int [][] foldIndex = new int [ k ][ 2 ];
        Vector segmentList = new Vector(q + 1);
        // set the random seed
        Random random = new Random(m_Seed);

        data.randomize(random);

        // create index arrays for the different segments

        int currentDataIndex = 0;

        for( int count = 1; count <= (q + 1); count++ ){
            if( count > q){
                int [] segmentIndex = new int [ (data.numInstances() - (q * tps)) ];
                for(int index = 0; index < segmentIndex.length; index++, currentDataIndex++){

                    segmentIndex[index] = currentDataIndex;
                }
                segmentList.add(segmentIndex);
            } else {
                int [] segmentIndex = new int [ tps ];

                for(int index = 0; index < segmentIndex.length; index++, currentDataIndex++){
                    segmentIndex[index] = currentDataIndex;
                }
                segmentList.add(segmentIndex);
            }
        }
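
        // Worked example (assumed sizes): with |D| = 100 instances and tps = 30,
        // q = floor(100/30) = 3 full segments of 30 instances each, plus one
        // leftover segment of 100 - 3*30 = 10 instances stored last in segmentList.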

        int remainder = tps % k; // remainder is used to determine when to shrink the fold size by 1.

        // foldSize = ceil(tps / k), e.g. 3 -> 3, 3.3 -> 4
        int foldSize = (int) Math.ceil( (double)tps /(double) k); // round the fold size up to an integer
        int index = 0;
        int currentIndex;

        for( int count = 0; count < k; count ++){
            if( remainder != 0 && count == remainder ){
                foldSize -= 1;
            }
            foldIndex[count][0] = index;
            foldIndex[count][1] = foldSize;
            index += foldSize;
        }
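
        // Worked example (assumed sizes): tps = 10 and k = 3 give foldSize = ceil(10/3) = 4
        // and remainder = 10 % 3 = 1, so the fold sizes become {4, 3, 3}, summing to tps.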

        for( int l = 0; l < m_ClassifyIterations; l++) {

            for(int i = 1; i <= q; i++){

                int [] currentSegment = (int[]) segmentList.get(i - 1);

                randomize(currentSegment, random);

                // cross-fold validation for the current segment
                for( int j = 1; j <= k; j++){

                    // build the training pool TP from all folds except fold j
                    Instances TP = null;
                    for(int foldNum = 1; foldNum <= k; foldNum++){
                        if( foldNum != j){

                            int startFoldIndex = foldIndex[ foldNum - 1 ][ 0 ]; // start index
                            foldSize = foldIndex[ foldNum - 1 ][ 1 ];
                            int endFoldIndex = startFoldIndex + foldSize - 1;

                            for(int currentFoldIndex = startFoldIndex; currentFoldIndex <= endFoldIndex; currentFoldIndex++){

                                if( TP == null ){
                                    TP = new Instances(data, currentSegment[ currentFoldIndex ], 1);
                                } else {
                                    TP.add( data.instance( currentSegment[ currentFoldIndex ] ) );
                                }
                            }
                        }
                    }

                    TP.randomize(random);

                    if( getTrainSize() > TP.numInstances() ){
                        throw new Exception("The training set size of " + getTrainSize() + " is greater than the training pool size of "
                        + TP.numInstances() );
                    }

                    Instances train = new Instances(TP, 0, m_TrainSize);

                    Classifier current = AbstractClassifier.makeCopy(m_Classifier);
                    current.buildClassifier(train); // build a classifier from the instances in train

                    int currentTestIndex = foldIndex[ j - 1 ][ 0 ]; // start index
                    int testFoldSize = foldIndex[ j - 1 ][ 1 ]; // size
                    int endTestIndex = currentTestIndex + testFoldSize - 1;

                    // test on the held-out fold j
                    while( currentTestIndex <= endTestIndex ){

                        Instance testInst = data.instance( currentSegment[currentTestIndex] );
                        int pred = (int)current.classifyInstance( testInst );

                        if(pred != testInst.classValue()) {
                            m_Error++; // add 1 to the count of misclassifications
                        }
                        instanceProbs[ currentSegment[ currentTestIndex ] ][ pred ]++;
                        currentTestIndex++;
                    }

                    // the first classifier trained in each run also classifies the
                    // leftover segment, so that every instance is covered
                    if( i == 1 && j == 1){
                        int[] segmentElast = (int[])segmentList.lastElement();
                        for( currentIndex = 0; currentIndex < segmentElast.length; currentIndex++){
                            Instance testInst = data.instance( segmentElast[currentIndex] );
                            int pred = (int)current.classifyInstance( testInst );
                            if(pred != testInst.classValue()) {
                                m_Error++; // add 1 to the count of misclassifications
                            }

                            instanceProbs[ segmentElast[ currentIndex ] ][ pred ]++;
                        }
                    }
                }
            }
        }

        m_Error /= (double)( m_ClassifyIterations * data.numInstances() );

        m_KWBias = 0.0;
        m_KWVariance = 0.0;
        m_KWSigma = 0.0;

        m_WBias = 0.0;
        m_WVariance = 0.0;

        for (int i = 0; i < data.numInstances(); i++) {

            Instance current = data.instance( i );

            double [] predProbs = instanceProbs[ i ];
            double pActual, pPred;
            double bsum = 0, vsum = 0, ssum = 0;
            double wBSum = 0, wVSum = 0;

            Vector centralTendencies = findCentralTendencies( predProbs );

            if( centralTendencies == null ){
                throw new Exception("Central tendency was null.");
            }

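            // Kohavi & Wolpert terms for this instance, accumulated over classes j:
            //   bias^2: sum_j (pActual_j - pPred_j)^2, with a small-sample correction,
            //   variance: 1 - sum_j pPred_j^2,
            //   sigma: 1 - sum_j pActual_j^2,
            // where pActual_j is 1 for the true class (0 otherwise) and pPred_j is
            // the estimated probability predProbs[j] / m_ClassifyIterations.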
            for (int j = 0; j < numClasses; j++) {
                pActual = (current.classValue() == j) ? 1 : 0;
                pPred = predProbs[j] / m_ClassifyIterations;
                bsum += (pActual - pPred) * (pActual - pPred) - pPred * (1 - pPred) / (m_ClassifyIterations - 1);
                vsum += pPred * pPred;
                ssum += pActual * pActual;
            }

            m_KWBias += bsum;
            m_KWVariance += (1 - vsum);
            m_KWSigma += (1 - ssum);

            for( int count = 0; count < centralTendencies.size(); count++ ) {

                int wB = 0, wV = 0;
                int centralTendency = ((Integer)centralTendencies.get(count)).intValue();

                // For a single instance xi, find the bias and variance.
                for (int j = 0; j < numClasses; j++) {

                    // Webb definition
                    if( j != (int)current.classValue() && j == centralTendency ) {
                        wB += predProbs[j];
                    }
                    if( j != (int)current.classValue() && j != centralTendency ) {
                        wV += predProbs[j];
                    }

                }
                wBSum += (double) wB;
                wVSum += (double) wV;
            }

            // Calculate the bias by dividing wBSum by the number of central tendencies
            // and the number of classification iterations (effectively averaging and
            // normalising to a probability).

            m_WBias += ( wBSum / ((double) ( centralTendencies.size() * m_ClassifyIterations )));
            // Calculate the variance by dividing wVSum the same way.
            m_WVariance += ( wVSum / ((double) ( centralTendencies.size() * m_ClassifyIterations )));

        }

        m_KWBias /= (2.0 * (double) data.numInstances());
        m_KWVariance /= (2.0 * (double) data.numInstances());
        m_KWSigma /= (2.0 * (double) data.numInstances());

        // bias = bias / number of data instances
        m_WBias /= (double) data.numInstances();
        // variance = variance / number of data instances
        m_WVariance /= (double) data.numInstances();

        if (m_Debug) {
            System.err.println("Decomposition finished");
        }

    }

    /** Finds the central tendency, given the classifications for an instance.
     *
     * The central tendency is defined as the class that was most commonly
     * selected for a given instance.<p>
     *
     * For example, instance 'x' may be classified out of 3 classes y = {1, 2, 3};
     * if x is classified 10 times as follows, '1' = 2 times, '2' = 5 times
     * and '3' = 3 times, then the central tendency is '2'. <p>
     *
     * Note, however, that this method returns a list of all classes that have
     * the highest number of classifications.
     *
     * In cases where several classes share the largest number of classifications,
     * all of these classes are returned. For example, if 'x' is classified '1' = 4 times,
     * '2' = 4 times and '3' = 2 times, then '1' and '2' are returned.<p>
     *
     * @param predProbs the array of classifications for a single instance.
     *
     * @return a Vector containing Integer objects which store the class(es) which
     * are the central tendency.
     */
    public Vector findCentralTendencies(double[] predProbs) {

        int centralTValue = 0;
        int currentValue = 0;
        // vector to store the list of classes that have the greatest number of classifications
        Vector centralTClasses;

        centralTClasses = new Vector(); // create an empty vector to hold the classes

        // Go through the array, finding the central tendency.
        for( int i = 0; i < predProbs.length; i++) {
            currentValue = (int) predProbs[i];
            // if the current value is greater than the central tendency value,
            // clear the vector and add the new class to it
            if( currentValue > centralTValue) {
                centralTClasses.clear();
                centralTClasses.addElement( new Integer(i) );
                centralTValue = currentValue;
            } else if( currentValue != 0 && currentValue == centralTValue) {
                centralTClasses.addElement( new Integer(i) );
            }
        }
        // return all classes that have the greatest number of classifications
        if( centralTValue != 0){
            return centralTClasses;
        } else {
            return null;
        }

    }
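
    // Example (hypothetical counts): findCentralTendencies(new double[]{2, 5, 3})
    // returns a Vector containing only Integer(1), since class 1 was predicted most
    // often; findCentralTendencies(new double[]{4, 4, 2}) returns Integer(0) and Integer(1).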

    /**
     * Returns a description of the bias-variance decomposition results.
     *
     * @return the bias-variance decomposition results as a string
     */
    public String toString() {

        String result = "\nBias-Variance Decomposition Segmentation, Cross Validation\n" +
        "with subsampling.\n";

        if (getClassifier() == null) {
            return "Invalid setup";
        }

        result += "\nClassifier    : " + getClassifier().getClass().getName();
        if (getClassifier() instanceof OptionHandler) {
            result += Utils.joinOptions(((OptionHandler)m_Classifier).getOptions());
        }
        result += "\nData File     : " + getDataFileName();
        result += "\nClass Index   : ";
        if (getClassIndex() == 0) {
            result += "last";
        } else {
            result += getClassIndex();
        }
        result += "\nIterations    : " + getClassifyIterations();
        result += "\np             : " + getP();
        result += "\nTraining Size : " + getTrainSize();
        result += "\nSeed          : " + getSeed();

        result += "\n\nDefinition    : " + "Kohavi and Wolpert";
        result += "\nError         : " + Utils.doubleToString(getError(), 4);
        result += "\nBias^2        : " + Utils.doubleToString(getKWBias(), 4);
        result += "\nVariance      : " + Utils.doubleToString(getKWVariance(), 4);
        result += "\nSigma^2       : " + Utils.doubleToString(getKWSigma(), 4);

        result += "\n\nDefinition    : " + "Webb";
        result += "\nError         : " + Utils.doubleToString(getError(), 4);
        result += "\nBias          : " + Utils.doubleToString(getWBias(), 4);
        result += "\nVariance      : " + Utils.doubleToString(getWVariance(), 4);

        return result;
    }

    /**
     * Returns the revision string.
     *
     * @return          the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 6041 $");
    }

    /**
     * Test method for this class.
     *
     * @param args the command line arguments
     */
    public static void main(String [] args) {
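
        // Example command line (the dataset file name is hypothetical):
        //   java weka.classifiers.BVDecomposeSegCVSub -t iris.arff -T 75 -p 0.6 \
        //       -l 10 -s 1 -W weka.classifiers.bayes.NaiveBayes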

        try {
            BVDecomposeSegCVSub bvd = new BVDecomposeSegCVSub();

            try {
                bvd.setOptions(args);
                Utils.checkForRemainingOptions(args);
            } catch (Exception ex) {
                String result = ex.getMessage() + "\nBVDecomposeSegCVSub Options:\n\n";
                Enumeration enu = bvd.listOptions();
                while (enu.hasMoreElements()) {
                    Option option = (Option) enu.nextElement();
                    result += option.synopsis() + "\n" + option.description() + "\n";
                }
                throw new Exception(result);
            }

            bvd.decompose();

            System.out.println(bvd.toString());

        } catch (Exception ex) {
            System.err.println(ex.getMessage());
        }

    }

    /**
     * Accepts an array of ints and randomises the order of its values in place,
     * using the supplied random number generator.
     *
     * @param index the array of integers
     * @param random the random number generator
     */
    public final void randomize(int[] index, Random random) {
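        // Fisher-Yates shuffle: walk backwards through the array, swapping each
        // position with a randomly chosen earlier (or identical) position.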
        for( int j = index.length - 1; j > 0; j-- ){
            int k = random.nextInt( j + 1 );
            int temp = index[j];
            index[j] = index[k];
            index[k] = temp;
        }
    }
}