Context Navigation

source: src/main/java/weka/experiment/CostSensitiveClassifierSplitEvaluator.java @ 20

Last change on this file since 20 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 18.3 KB

Rev	Line
[4]	1	/*
	2	* This program is free software; you can redistribute it and/or modify
	3	* it under the terms of the GNU General Public License as published by
	4	* the Free Software Foundation; either version 2 of the License, or
	5	* (at your option) any later version.
	6	*
	7	* This program is distributed in the hope that it will be useful,
	8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	* GNU General Public License for more details.
	11	*
	12	* You should have received a copy of the GNU General Public License
	13	* along with this program; if not, write to the Free Software
	14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	15	*/
	16
	17	/*
	18	* CostSensitiveClassifierSplitEvaluator.java
	19	* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
	20	*
	21	*/
	22
	23
	24	package weka.experiment;
	25
	26	import weka.classifiers.Classifier;
	27	import weka.classifiers.AbstractClassifier;
	28	import weka.classifiers.CostMatrix;
	29	import weka.classifiers.Evaluation;
	30	import weka.core.AdditionalMeasureProducer;
	31	import weka.core.Attribute;
	32	import weka.core.Instance;
	33	import weka.core.Instances;
	34	import weka.core.Option;
	35	import weka.core.RevisionUtils;
	36	import weka.core.Summarizable;
	37	import weka.core.Utils;
	38
	39	import java.io.BufferedReader;
	40	import java.io.ByteArrayOutputStream;
	41	import java.io.File;
	42	import java.io.FileReader;
	43	import java.io.ObjectOutputStream;
	44	import java.lang.management.ManagementFactory;
	45	import java.lang.management.ThreadMXBean;
	46	import java.util.Enumeration;
	47	import java.util.Vector;
	48
	49	/**
	50	<!-- globalinfo-start -->
	51	* SplitEvaluator that produces results for a classification scheme on a nominal class attribute, including weighted misclassification costs.
	52	* <p/>
	53	<!-- globalinfo-end -->
	54	*
	55	<!-- options-start -->
	56	* Valid options are: <p/>
	57	*
	58	* <pre> -W <class name>
	59	* The full class name of the classifier.
	60	* eg: weka.classifiers.bayes.NaiveBayes</pre>
	61	*
	62	* <pre> -C <index>
	63	* The index of the class for which IR statistics
	64	* are to be output. (default 1)</pre>
	65	*
	66	* <pre> -I <index>
	67	* The index of an attribute to output in the
	68	* results. This attribute should identify an
	69	* instance in order to know which instances are
	70	* in the test set of a cross validation. if 0
	71	* no output (default 0).</pre>
	72	*
	73	* <pre> -P
	74	* Add target and prediction columns to the result
	75	* for each fold.</pre>
	76	*
	77	* <pre>
	78	* Options specific to classifier weka.classifiers.rules.ZeroR:
	79	* </pre>
	80	*
	81	* <pre> -D
	82	* If set, classifier is run in debug mode and
	83	* may output additional info to the console</pre>
	84	*
	85	* <pre> -D <directory>
	86	* Name of a directory to search for cost files when loading
	87	* costs on demand (default current directory).</pre>
	88	*
	89	<!-- options-end -->
	90	*
	91	* All options after -- will be passed to the classifier.
	92	*
	93	* @author Len Trigg (len@reeltwo.com)
	94	* @version $Revision: 5987 $
	95	*/
	96	public class CostSensitiveClassifierSplitEvaluator
	97	extends ClassifierSplitEvaluator {
	98
	99	/** for serialization */
	100	static final long serialVersionUID = -8069566663019501276L;
	101
	102	/**
	103	* The directory used when loading cost files on demand, null indicates
	104	* current directory
	105	*/
	106	protected File m_OnDemandDirectory = new File(System.getProperty("user.dir"));
	107
	108	/** The length of a result */
	109	private static final int RESULT_SIZE = 31;
	110
	111	/**
	112	* Returns a string describing this split evaluator
	113	* @return a description of the split evaluator suitable for
	114	* displaying in the explorer/experimenter gui
	115	*/
	116	public String globalInfo() {
	117	return " SplitEvaluator that produces results for a classification scheme "
	118	+"on a nominal class attribute, including weighted misclassification "
	119	+"costs.";
	120	}
	121
	122	/**
	123	* Returns an enumeration describing the available options..
	124	*
	125	* @return an enumeration of all the available options.
	126	*/
	127	public Enumeration listOptions() {
	128
	129	Vector newVector = new Vector(1);
	130	Enumeration enu = super.listOptions();
	131	while (enu.hasMoreElements()) {
	132	newVector.addElement(enu.nextElement());
	133	}
	134
	135	newVector.addElement(new Option(
	136	"\tName of a directory to search for cost files when loading\n"
	137	+"\tcosts on demand (default current directory).",
	138	"D", 1, "-D <directory>"));
	139
	140	return newVector.elements();
	141	}
	142
	143	/**
	144	* Parses a given list of options. <p/>
	145	*
	146	<!-- options-start -->
	147	* Valid options are: <p/>
	148	*
	149	* <pre> -W <class name>
	150	* The full class name of the classifier.
	151	* eg: weka.classifiers.bayes.NaiveBayes</pre>
	152	*
	153	* <pre> -C <index>
	154	* The index of the class for which IR statistics
	155	* are to be output. (default 1)</pre>
	156	*
	157	* <pre> -I <index>
	158	* The index of an attribute to output in the
	159	* results. This attribute should identify an
	160	* instance in order to know which instances are
	161	* in the test set of a cross validation. if 0
	162	* no output (default 0).</pre>
	163	*
	164	* <pre> -P
	165	* Add target and prediction columns to the result
	166	* for each fold.</pre>
	167	*
	168	* <pre>
	169	* Options specific to classifier weka.classifiers.rules.ZeroR:
	170	* </pre>
	171	*
	172	* <pre> -D
	173	* If set, classifier is run in debug mode and
	174	* may output additional info to the console</pre>
	175	*
	176	* <pre> -D <directory>
	177	* Name of a directory to search for cost files when loading
	178	* costs on demand (default current directory).</pre>
	179	*
	180	<!-- options-end -->
	181	*
	182	* All options after -- will be passed to the classifier.
	183	*
	184	* @param options the list of options as an array of strings
	185	* @throws Exception if an option is not supported
	186	*/
	187	public void setOptions(String[] options) throws Exception {
	188
	189	String demandDir = Utils.getOption('D', options);
	190	if (demandDir.length() != 0) {
	191	setOnDemandDirectory(new File(demandDir));
	192	}
	193
	194	super.setOptions(options);
	195	}
	196
	197	/**
	198	* Gets the current settings of the Classifier.
	199	*
	200	* @return an array of strings suitable for passing to setOptions
	201	*/
	202	public String [] getOptions() {
	203
	204	String [] superOptions = super.getOptions();
	205	String [] options = new String [superOptions.length + 3];
	206	int current = 0;
	207
	208	options[current++] = "-D";
	209	options[current++] = "" + getOnDemandDirectory();
	210
	211	System.arraycopy(superOptions, 0, options, current,
	212	superOptions.length);
	213	current += superOptions.length;
	214	while (current < options.length) {
	215	options[current++] = "";
	216	}
	217	return options;
	218	}
	219
	220	/**
	221	* Returns the tip text for this property
	222	* @return tip text for this property suitable for
	223	* displaying in the explorer/experimenter gui
	224	*/
	225	public String onDemandDirectoryTipText() {
	226	return "The directory to look in for cost files. This directory will be "
	227	+"searched for cost files when loading on demand.";
	228	}
	229
	230	/**
	231	* Returns the directory that will be searched for cost files when
	232	* loading on demand.
	233	*
	234	* @return The cost file search directory.
	235	*/
	236	public File getOnDemandDirectory() {
	237
	238	return m_OnDemandDirectory;
	239	}
	240
	241	/**
	242	* Sets the directory that will be searched for cost files when
	243	* loading on demand.
	244	*
	245	* @param newDir The cost file search directory.
	246	*/
	247	public void setOnDemandDirectory(File newDir) {
	248
	249	if (newDir.isDirectory()) {
	250	m_OnDemandDirectory = newDir;
	251	} else {
	252	m_OnDemandDirectory = new File(newDir.getParent());
	253	}
	254	}
	255
	256	/**
	257	* Gets the data types of each of the result columns produced for a
	258	* single run. The number of result fields must be constant
	259	* for a given SplitEvaluator.
	260	*
	261	* @return an array containing objects of the type of each result column.
	262	* The objects should be Strings, or Doubles.
	263	*/
	264	public Object [] getResultTypes() {
	265	int addm = (m_AdditionalMeasures != null)
	266	? m_AdditionalMeasures.length
	267	: 0;
	268	Object [] resultTypes = new Object[RESULT_SIZE+addm];
	269	Double doub = new Double(0);
	270	int current = 0;
	271	resultTypes[current++] = doub;
	272	resultTypes[current++] = doub;
	273
	274	resultTypes[current++] = doub;
	275	resultTypes[current++] = doub;
	276	resultTypes[current++] = doub;
	277	resultTypes[current++] = doub;
	278	resultTypes[current++] = doub;
	279	resultTypes[current++] = doub;
	280	resultTypes[current++] = doub;
	281	resultTypes[current++] = doub;
	282
	283	resultTypes[current++] = doub;
	284	resultTypes[current++] = doub;
	285	resultTypes[current++] = doub;
	286	resultTypes[current++] = doub;
	287
	288	resultTypes[current++] = doub;
	289	resultTypes[current++] = doub;
	290	resultTypes[current++] = doub;
	291	resultTypes[current++] = doub;
	292	resultTypes[current++] = doub;
	293	resultTypes[current++] = doub;
	294
	295	resultTypes[current++] = doub;
	296	resultTypes[current++] = doub;
	297	resultTypes[current++] = doub;
	298
	299	// Timing stats
	300	resultTypes[current++] = doub;
	301	resultTypes[current++] = doub;
	302	resultTypes[current++] = doub;
	303	resultTypes[current++] = doub;
	304
	305	// sizes
	306	resultTypes[current++] = doub;
	307	resultTypes[current++] = doub;
	308	resultTypes[current++] = doub;
	309
	310	resultTypes[current++] = "";
	311
	312	// add any additional measures
	313	for (int i=0;i<addm;i++) {
	314	resultTypes[current++] = doub;
	315	}
	316	if (current != RESULT_SIZE+addm) {
	317	throw new Error("ResultTypes didn't fit RESULT_SIZE");
	318	}
	319	return resultTypes;
	320	}
	321
	322	/**
	323	* Gets the names of each of the result columns produced for a single run.
	324	* The number of result fields must be constant
	325	* for a given SplitEvaluator.
	326	*
	327	* @return an array containing the name of each result column
	328	*/
	329	public String [] getResultNames() {
	330	int addm = (m_AdditionalMeasures != null)
	331	? m_AdditionalMeasures.length
	332	: 0;
	333	String [] resultNames = new String[RESULT_SIZE+addm];
	334	int current = 0;
	335	resultNames[current++] = "Number_of_training_instances";
	336	resultNames[current++] = "Number_of_testing_instances";
	337
	338	// Basic performance stats - right vs wrong
	339	resultNames[current++] = "Number_correct";
	340	resultNames[current++] = "Number_incorrect";
	341	resultNames[current++] = "Number_unclassified";
	342	resultNames[current++] = "Percent_correct";
	343	resultNames[current++] = "Percent_incorrect";
	344	resultNames[current++] = "Percent_unclassified";
	345	resultNames[current++] = "Total_cost";
	346	resultNames[current++] = "Average_cost";
	347
	348	// Sensitive stats - certainty of predictions
	349	resultNames[current++] = "Mean_absolute_error";
	350	resultNames[current++] = "Root_mean_squared_error";
	351	resultNames[current++] = "Relative_absolute_error";
	352	resultNames[current++] = "Root_relative_squared_error";
	353
	354	// SF stats
	355	resultNames[current++] = "SF_prior_entropy";
	356	resultNames[current++] = "SF_scheme_entropy";
	357	resultNames[current++] = "SF_entropy_gain";
	358	resultNames[current++] = "SF_mean_prior_entropy";
	359	resultNames[current++] = "SF_mean_scheme_entropy";
	360	resultNames[current++] = "SF_mean_entropy_gain";
	361
	362	// K&B stats
	363	resultNames[current++] = "KB_information";
	364	resultNames[current++] = "KB_mean_information";
	365	resultNames[current++] = "KB_relative_information";
	366
	367	// Timing stats
	368	resultNames[current++] = "Elapsed_Time_training";
	369	resultNames[current++] = "Elapsed_Time_testing";
	370	resultNames[current++] = "UserCPU_Time_training";
	371	resultNames[current++] = "UserCPU_Time_testing";
	372
	373	// sizes
	374	resultNames[current++] = "Serialized_Model_Size";
	375	resultNames[current++] = "Serialized_Train_Set_Size";
	376	resultNames[current++] = "Serialized_Test_Set_Size";
	377
	378	// Classifier defined extras
	379	resultNames[current++] = "Summary";
	380	// add any additional measures
	381	for (int i=0;i<addm;i++) {
	382	resultNames[current++] = m_AdditionalMeasures[i];
	383	}
	384	if (current != RESULT_SIZE+addm) {
	385	throw new Error("ResultNames didn't fit RESULT_SIZE");
	386	}
	387	return resultNames;
	388	}
	389
	390	/**
	391	* Gets the results for the supplied train and test datasets. Now performs
	392	* a deep copy of the classifier before it is built and evaluated (just in case
	393	* the classifier is not initialized properly in buildClassifier()).
	394	*
	395	* @param train the training Instances.
	396	* @param test the testing Instances.
	397	* @return the results stored in an array. The objects stored in
	398	* the array may be Strings, Doubles, or null (for the missing value).
	399	* @throws Exception if a problem occurs while getting the results
	400	*/
	401	public Object [] getResult(Instances train, Instances test)
	402	throws Exception {
	403
	404	if (train.classAttribute().type() != Attribute.NOMINAL) {
	405	throw new Exception("Class attribute is not nominal!");
	406	}
	407	if (m_Template == null) {
	408	throw new Exception("No classifier has been specified");
	409	}
	410	ThreadMXBean thMonitor = ManagementFactory.getThreadMXBean();
	411	boolean canMeasureCPUTime = thMonitor.isThreadCpuTimeSupported();
	412	if(!thMonitor.isThreadCpuTimeEnabled())
	413	thMonitor.setThreadCpuTimeEnabled(true);
	414
	415	int addm = (m_AdditionalMeasures != null) ? m_AdditionalMeasures.length : 0;
	416	Object [] result = new Object[RESULT_SIZE+addm];
	417	long thID = Thread.currentThread().getId();
	418	long CPUStartTime=-1, trainCPUTimeElapsed=-1, testCPUTimeElapsed=-1,
	419	trainTimeStart, trainTimeElapsed, testTimeStart, testTimeElapsed;
	420
	421	String costName = train.relationName() + CostMatrix.FILE_EXTENSION;
	422	File costFile = new File(getOnDemandDirectory(), costName);
	423	if (!costFile.exists()) {
	424	throw new Exception("On-demand cost file doesn't exist: " + costFile);
	425	}
	426	CostMatrix costMatrix = new CostMatrix(new BufferedReader(
	427	new FileReader(costFile)));
	428
	429	Evaluation eval = new Evaluation(train, costMatrix);
	430	m_Classifier = AbstractClassifier.makeCopy(m_Template);
	431
	432	trainTimeStart = System.currentTimeMillis();
	433	if(canMeasureCPUTime)
	434	CPUStartTime = thMonitor.getThreadUserTime(thID);
	435	m_Classifier.buildClassifier(train);
	436	if(canMeasureCPUTime)
	437	trainCPUTimeElapsed = thMonitor.getThreadUserTime(thID) - CPUStartTime;
	438	trainTimeElapsed = System.currentTimeMillis() - trainTimeStart;
	439	testTimeStart = System.currentTimeMillis();
	440	if(canMeasureCPUTime)
	441	CPUStartTime = thMonitor.getThreadUserTime(thID);
	442	eval.evaluateModel(m_Classifier, test);
	443	if(canMeasureCPUTime)
	444	testCPUTimeElapsed = thMonitor.getThreadUserTime(thID) - CPUStartTime;
	445	testTimeElapsed = System.currentTimeMillis() - testTimeStart;
	446	thMonitor = null;
	447
	448	m_result = eval.toSummaryString();
	449	// The results stored are all per instance -- can be multiplied by the
	450	// number of instances to get absolute numbers
	451	int current = 0;
	452	result[current++] = new Double(train.numInstances());
	453	result[current++] = new Double(eval.numInstances());
	454
	455	result[current++] = new Double(eval.correct());
	456	result[current++] = new Double(eval.incorrect());
	457	result[current++] = new Double(eval.unclassified());
	458	result[current++] = new Double(eval.pctCorrect());
	459	result[current++] = new Double(eval.pctIncorrect());
	460	result[current++] = new Double(eval.pctUnclassified());
	461	result[current++] = new Double(eval.totalCost());
	462	result[current++] = new Double(eval.avgCost());
	463
	464	result[current++] = new Double(eval.meanAbsoluteError());
	465	result[current++] = new Double(eval.rootMeanSquaredError());
	466	result[current++] = new Double(eval.relativeAbsoluteError());
	467	result[current++] = new Double(eval.rootRelativeSquaredError());
	468
	469	result[current++] = new Double(eval.SFPriorEntropy());
	470	result[current++] = new Double(eval.SFSchemeEntropy());
	471	result[current++] = new Double(eval.SFEntropyGain());
	472	result[current++] = new Double(eval.SFMeanPriorEntropy());
	473	result[current++] = new Double(eval.SFMeanSchemeEntropy());
	474	result[current++] = new Double(eval.SFMeanEntropyGain());
	475
	476	// K&B stats
	477	result[current++] = new Double(eval.KBInformation());
	478	result[current++] = new Double(eval.KBMeanInformation());
	479	result[current++] = new Double(eval.KBRelativeInformation());
	480
	481	// Timing stats
	482	result[current++] = new Double(trainTimeElapsed / 1000.0);
	483	result[current++] = new Double(testTimeElapsed / 1000.0);
	484	if(canMeasureCPUTime) {
	485	result[current++] = new Double((trainCPUTimeElapsed/1000000.0) / 1000.0);
	486	result[current++] = new Double((testCPUTimeElapsed /1000000.0) / 1000.0);
	487	}
	488	else {
	489	result[current++] = new Double(Utils.missingValue());
	490	result[current++] = new Double(Utils.missingValue());
	491	}
	492
	493	// sizes
	494	ByteArrayOutputStream bastream = new ByteArrayOutputStream();
	495	ObjectOutputStream oostream = new ObjectOutputStream(bastream);
	496	oostream.writeObject(m_Classifier);
	497	result[current++] = new Double(bastream.size());
	498	bastream = new ByteArrayOutputStream();
	499	oostream = new ObjectOutputStream(bastream);
	500	oostream.writeObject(train);
	501	result[current++] = new Double(bastream.size());
	502	bastream = new ByteArrayOutputStream();
	503	oostream = new ObjectOutputStream(bastream);
	504	oostream.writeObject(test);
	505	result[current++] = new Double(bastream.size());
	506
	507	if (m_Classifier instanceof Summarizable) {
	508	result[current++] = ((Summarizable)m_Classifier).toSummaryString();
	509	} else {
	510	result[current++] = null;
	511	}
	512
	513	for (int i=0;i<addm;i++) {
	514	if (m_doesProduce[i]) {
	515	try {
	516	double dv = ((AdditionalMeasureProducer)m_Classifier).
	517	getMeasure(m_AdditionalMeasures[i]);
	518	if (!Utils.isMissingValue(dv)) {
	519	Double value = new Double(dv);
	520	result[current++] = value;
	521	} else {
	522	result[current++] = null;
	523	}
	524	} catch (Exception ex) {
	525	System.err.println(ex);
	526	}
	527	} else {
	528	result[current++] = null;
	529	}
	530	}
	531
	532	if (current != RESULT_SIZE+addm) {
	533	throw new Error("Results didn't fit RESULT_SIZE");
	534	}
	535	return result;
	536	}
	537
	538	/**
	539	* Returns a text description of the split evaluator.
	540	*
	541	* @return a text description of the split evaluator.
	542	*/
	543	public String toString() {
	544
	545	String result = "CostSensitiveClassifierSplitEvaluator: ";
	546	if (m_Template == null) {
	547	return result + "<null> classifier";
	548	}
	549	return result + m_Template.getClass().getName() + " "
	550	+ m_ClassifierOptions + "(version " + m_ClassifierVersion + ")";
	551	}
	552
	553	/**
	554	* Returns the revision string.
	555	*
	556	* @return the revision
	557	*/
	558	public String getRevision() {
	559	return RevisionUtils.extract("$Revision: 5987 $");
	560	}
	561	} // CostSensitiveClassifierSplitEvaluator

Note: See TracBrowser for help on using the repository browser.

Download in other formats: