Context Navigation

source: src/main/java/weka/classifiers/evaluation/ThresholdCurve.java @ 14

Last change on this file since 14 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 15.7 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* ThresholdCurve.java
19	* Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.classifiers.evaluation;
24
25	import weka.classifiers.Classifier;
26	import weka.classifiers.AbstractClassifier;
27	import weka.core.Attribute;
28	import weka.core.FastVector;
29	import weka.core.Instance;
30	import weka.core.DenseInstance;
31	import weka.core.Instances;
32	import weka.core.RevisionHandler;
33	import weka.core.RevisionUtils;
34	import weka.core.Utils;
35
36	/**
37	* Generates points illustrating prediction tradeoffs that can be obtained
38	* by varying the threshold value between classes. For example, the typical
39	* threshold value of 0.5 means the predicted probability of "positive" must be
40	* higher than 0.5 for the instance to be predicted as "positive". The
41	* resulting dataset can be used to visualize precision/recall tradeoff, or
42	* for ROC curve analysis (true positive rate vs false positive rate).
43	* Weka just varies the threshold on the class probability estimates in each
44	* case. The Mann Whitney statistic is used to calculate the AUC.
45	*
46	* @author Len Trigg (len@reeltwo.com)
47	* @version $Revision: 5987 $
48	*/
49	public class ThresholdCurve
50	implements RevisionHandler {
51
52	/** The relation name of threshold curve datasets; the static helpers of this class check for it */
53	public static final String RELATION_NAME = "ThresholdCurve";
54
55	/** attribute name: True Positives (at the row's threshold) */
56	public static final String TRUE_POS_NAME = "True Positives";
57	/** attribute name: False Negatives (at the row's threshold) */
58	public static final String FALSE_NEG_NAME = "False Negatives";
59	/** attribute name: False Positives (at the row's threshold) */
60	public static final String FALSE_POS_NAME = "False Positives";
61	/** attribute name: True Negatives (at the row's threshold) */
62	public static final String TRUE_NEG_NAME = "True Negatives";
63	/** attribute name: False Positive Rate */
64	public static final String FP_RATE_NAME = "False Positive Rate";
65	/** attribute name: True Positive Rate */
66	public static final String TP_RATE_NAME = "True Positive Rate";
67	/** attribute name: Precision */
68	public static final String PRECISION_NAME = "Precision";
69	/** attribute name: Recall */
70	public static final String RECALL_NAME = "Recall";
71	/** attribute name: Fallout */
72	public static final String FALLOUT_NAME = "Fallout";
73	/** attribute name: FMeasure */
74	public static final String FMEASURE_NAME = "FMeasure";
75	/** attribute name: Sample Size, stored as (TP + FP) / (TP + FP + TN + FN) */
76	public static final String SAMPLE_SIZE_NAME = "Sample Size";
77	/** attribute name: Lift, stored as missing when the true positives expected by chance are below 1 */
78	public static final String LIFT_NAME = "Lift";
79	/** attribute name: Threshold, the probability threshold the row's other values belong to */
80	public static final String THRESHOLD_NAME = "Threshold";
81
82	/**
83	* Calculates the performance stats for the default class and return
84	* results as a set of Instances. The
85	* structure of these Instances is as follows:<p> <ul>
86	* <li> <b>True Positives </b>
87	* <li> <b>False Negatives</b>
88	* <li> <b>False Positives</b>
89	* <li> <b>True Negatives</b>
90	* <li> <b>False Positive Rate</b>
91	* <li> <b>True Positive Rate</b>
92	* <li> <b>Precision</b>
93	* <li> <b>Recall</b>
94	* <li> <b>Fallout</b>
95	* <li> <b>Threshold</b> contains the probability threshold that gives
96	* rise to the previous performance values.
97	* </ul> <p>
98	* For the definitions of these measures, see TwoClassStats <p>
99	*
100	* @see TwoClassStats
101	* @param predictions the predictions to base the curve on
102	* @return datapoints as a set of instances, null if no predictions
103	* have been made.
104	*/
105	public Instances getCurve(FastVector predictions) {
106
107	if (predictions.size() == 0) {
108	return null;
109	}
110	return getCurve(predictions,
111	((NominalPrediction)predictions.elementAt(0))
112	.distribution().length - 1);
113	}
114
115	/**
116	* Calculates the performance stats for the desired class and return
117	* results as a set of Instances.
118	*
119	* @param predictions the predictions to base the curve on
120	* @param classIndex index of the class of interest.
121	* @return datapoints as a set of instances.
122	*/
123	public Instances getCurve(FastVector predictions, int classIndex) {
124
125	if ((predictions.size() == 0) \|\|
126	(((NominalPrediction)predictions.elementAt(0))
127	.distribution().length <= classIndex)) {
128	return null;
129	}
130
131	double totPos = 0, totNeg = 0;
132	double [] probs = getProbabilities(predictions, classIndex);
133
134	// Get distribution of positive/negatives
135	for (int i = 0; i < probs.length; i++) {
136	NominalPrediction pred = (NominalPrediction)predictions.elementAt(i);
137	if (pred.actual() == Prediction.MISSING_VALUE) {
138	System.err.println(getClass().getName()
139	+ " Skipping prediction with missing class value");
140	continue;
141	}
142	if (pred.weight() < 0) {
143	System.err.println(getClass().getName()
144	+ " Skipping prediction with negative weight");
145	continue;
146	}
147	if (pred.actual() == classIndex) {
148	totPos += pred.weight();
149	} else {
150	totNeg += pred.weight();
151	}
152	}
153
154	Instances insts = makeHeader();
155	int [] sorted = Utils.sort(probs);
156	TwoClassStats tc = new TwoClassStats(totPos, totNeg, 0, 0);
157	double threshold = 0;
158	double cumulativePos = 0;
159	double cumulativeNeg = 0;
160	for (int i = 0; i < sorted.length; i++) {
161
162	if ((i == 0) \|\| (probs[sorted[i]] > threshold)) {
163	tc.setTruePositive(tc.getTruePositive() - cumulativePos);
164	tc.setFalseNegative(tc.getFalseNegative() + cumulativePos);
165	tc.setFalsePositive(tc.getFalsePositive() - cumulativeNeg);
166	tc.setTrueNegative(tc.getTrueNegative() + cumulativeNeg);
167	threshold = probs[sorted[i]];
168	insts.add(makeInstance(tc, threshold));
169	cumulativePos = 0;
170	cumulativeNeg = 0;
171	if (i == sorted.length - 1) {
172	break;
173	}
174	}
175
176	NominalPrediction pred = (NominalPrediction)predictions.elementAt(sorted[i]);
177
178	if (pred.actual() == Prediction.MISSING_VALUE) {
179	System.err.println(getClass().getName()
180	+ " Skipping prediction with missing class value");
181	continue;
182	}
183	if (pred.weight() < 0) {
184	System.err.println(getClass().getName()
185	+ " Skipping prediction with negative weight");
186	continue;
187	}
188	if (pred.actual() == classIndex) {
189	cumulativePos += pred.weight();
190	} else {
191	cumulativeNeg += pred.weight();
192	}
193
194	/*
195	System.out.println(tc + " " + probs[sorted[i]]
196	+ " " + (pred.actual() == classIndex));
197	*/
198	/*if ((i != (sorted.length - 1)) &&
199	((i == 0) \|\|
200	(probs[sorted[i]] != probs[sorted[i - 1]]))) {
201	insts.add(makeInstance(tc, probs[sorted[i]]));
202	}*/
203	}
204	return insts;
205	}
206
207	/**
208	* Calculates the n point precision result, which is the precision averaged
209	* over n evenly spaced (w.r.t recall) samples of the curve.
210	*
211	* @param tcurve a previously extracted threshold curve Instances.
212	* @param n the number of points to average over.
213	* @return the n-point precision.
214	*/
215	public static double getNPointPrecision(Instances tcurve, int n) {
216
217	if (!RELATION_NAME.equals(tcurve.relationName())
218	\|\| (tcurve.numInstances() == 0)) {
219	return Double.NaN;
220	}
221	int recallInd = tcurve.attribute(RECALL_NAME).index();
222	int precisInd = tcurve.attribute(PRECISION_NAME).index();
223	double [] recallVals = tcurve.attributeToDoubleArray(recallInd);
224	int [] sorted = Utils.sort(recallVals);
225	double isize = 1.0 / (n - 1);
226	double psum = 0;
227	for (int i = 0; i < n; i++) {
228	int pos = binarySearch(sorted, recallVals, i * isize);
229	double recall = recallVals[sorted[pos]];
230	double precis = tcurve.instance(sorted[pos]).value(precisInd);
231	/*
232	System.err.println("Point " + (i + 1) + ": i=" + pos
233	+ " r=" + (i * isize)
234	+ " p'=" + precis
235	+ " r'=" + recall);
236	*/
237	// interpolate figures for non-endpoints
238	while ((pos != 0) && (pos < sorted.length - 1)) {
239	pos++;
240	double recall2 = recallVals[sorted[pos]];
241	if (recall2 != recall) {
242	double precis2 = tcurve.instance(sorted[pos]).value(precisInd);
243	double slope = (precis2 - precis) / (recall2 - recall);
244	double offset = precis - recall * slope;
245	precis = isize * i * slope + offset;
246	/*
247	System.err.println("Point2 " + (i + 1) + ": i=" + pos
248	+ " r=" + (i * isize)
249	+ " p'=" + precis2
250	+ " r'=" + recall2
251	+ " p''=" + precis);
252	*/
253	break;
254	}
255	}
256	psum += precis;
257	}
258	return psum / n;
259	}
260
261	/**
262	* Calculates the area under the ROC curve as the Wilcoxon-Mann-Whitney statistic.
263	*
264	* @param tcurve a previously extracted threshold curve Instances.
265	* @return the ROC area, or Double.NaN if you don't pass in
266	* a ThresholdCurve generated Instances.
267	*/
268	public static double getROCArea(Instances tcurve) {
269
270	final int n = tcurve.numInstances();
271	if (!RELATION_NAME.equals(tcurve.relationName())
272	\|\| (n == 0)) {
273	return Double.NaN;
274	}
275	final int tpInd = tcurve.attribute(TRUE_POS_NAME).index();
276	final int fpInd = tcurve.attribute(FALSE_POS_NAME).index();
277	final double [] tpVals = tcurve.attributeToDoubleArray(tpInd);
278	final double [] fpVals = tcurve.attributeToDoubleArray(fpInd);
279
280	double area = 0.0, cumNeg = 0.0;
281	final double totalPos = tpVals[0];
282	final double totalNeg = fpVals[0];
283	for (int i = 0; i < n; i++) {
284	double cip, cin;
285	if (i < n - 1) {
286	cip = tpVals[i] - tpVals[i + 1];
287	cin = fpVals[i] - fpVals[i + 1];
288	} else {
289	cip = tpVals[n - 1];
290	cin = fpVals[n - 1];
291	}
292	area += cip * (cumNeg + (0.5 * cin));
293	cumNeg += cin;
294	}
295	area /= (totalNeg * totalPos);
296
297	return area;
298	}
299
300	/**
301	* Gets the index of the instance with the closest threshold value to the
302	* desired target
303	*
304	* @param tcurve a set of instances that have been generated by this class
305	* @param threshold the target threshold
306	* @return the index of the instance that has threshold closest to
307	* the target, or -1 if this could not be found (i.e. no data, or
308	* bad threshold target)
309	*/
310	public static int getThresholdInstance(Instances tcurve, double threshold) {
311
312	if (!RELATION_NAME.equals(tcurve.relationName())
313	\|\| (tcurve.numInstances() == 0)
314	\|\| (threshold < 0)
315	\|\| (threshold > 1.0)) {
316	return -1;
317	}
318	if (tcurve.numInstances() == 1) {
319	return 0;
320	}
321	double [] tvals = tcurve.attributeToDoubleArray(tcurve.numAttributes() - 1);
322	int [] sorted = Utils.sort(tvals);
323	return binarySearch(sorted, tvals, threshold);
324	}
325
326	/**
327	* performs a binary search
328	*
329	* @param index the indices
330	* @param vals the values
331	* @param target the target to look for
332	* @return the index of the target
333	*/
334	private static int binarySearch(int [] index, double [] vals, double target) {
335
336	int lo = 0, hi = index.length - 1;
337	while (hi - lo > 1) {
338	int mid = lo + (hi - lo) / 2;
339	double midval = vals[index[mid]];
340	if (target > midval) {
341	lo = mid;
342	} else if (target < midval) {
343	hi = mid;
344	} else {
345	while ((mid > 0) && (vals[index[mid - 1]] == target)) {
346	mid --;
347	}
348	return mid;
349	}
350	}
351	return lo;
352	}
353
354	/**
355	*
356	* @param predictions the predictions to use
357	* @param classIndex the class index
358	* @return the probabilities
359	*/
360	private double [] getProbabilities(FastVector predictions, int classIndex) {
361
362	// sort by predicted probability of the desired class.
363	double [] probs = new double [predictions.size()];
364	for (int i = 0; i < probs.length; i++) {
365	NominalPrediction pred = (NominalPrediction)predictions.elementAt(i);
366	probs[i] = pred.distribution()[classIndex];
367	}
368	return probs;
369	}
370
371	/**
372	* generates the header
373	*
374	* @return the header
375	*/
376	private Instances makeHeader() {
377
378	FastVector fv = new FastVector();
379	fv.addElement(new Attribute(TRUE_POS_NAME));
380	fv.addElement(new Attribute(FALSE_NEG_NAME));
381	fv.addElement(new Attribute(FALSE_POS_NAME));
382	fv.addElement(new Attribute(TRUE_NEG_NAME));
383	fv.addElement(new Attribute(FP_RATE_NAME));
384	fv.addElement(new Attribute(TP_RATE_NAME));
385	fv.addElement(new Attribute(PRECISION_NAME));
386	fv.addElement(new Attribute(RECALL_NAME));
387	fv.addElement(new Attribute(FALLOUT_NAME));
388	fv.addElement(new Attribute(FMEASURE_NAME));
389	fv.addElement(new Attribute(SAMPLE_SIZE_NAME));
390	fv.addElement(new Attribute(LIFT_NAME));
391	fv.addElement(new Attribute(THRESHOLD_NAME));
392	return new Instances(RELATION_NAME, fv, 100);
393	}
394
395	/**
396	* generates an instance out of the given data
397	*
398	* @param tc the statistics
399	* @param prob the probability
400	* @return the generated instance
401	*/
402	private Instance makeInstance(TwoClassStats tc, double prob) {
403
404	int count = 0;
405	double [] vals = new double[13];
406	vals[count++] = tc.getTruePositive();
407	vals[count++] = tc.getFalseNegative();
408	vals[count++] = tc.getFalsePositive();
409	vals[count++] = tc.getTrueNegative();
410	vals[count++] = tc.getFalsePositiveRate();
411	vals[count++] = tc.getTruePositiveRate();
412	vals[count++] = tc.getPrecision();
413	vals[count++] = tc.getRecall();
414	vals[count++] = tc.getFallout();
415	vals[count++] = tc.getFMeasure();
416	double ss = (tc.getTruePositive() + tc.getFalsePositive()) /
417	(tc.getTruePositive() + tc.getFalsePositive() + tc.getTrueNegative() + tc.getFalseNegative());
418	vals[count++] = ss;
419	double expectedByChance = (ss * (tc.getTruePositive() + tc.getFalseNegative()));
420	if (expectedByChance < 1) {
421	vals[count++] = Utils.missingValue();
422	} else {
423	vals[count++] = tc.getTruePositive() / expectedByChance;
424
425	}
426	vals[count++] = prob;
427	return new DenseInstance(1.0, vals);
428	}
429
430	/**
431	* Returns the revision string.
432	*
433	* @return the revision
434	*/
435	public String getRevision() {
436	return RevisionUtils.extract("$Revision: 5987 $");
437	}
438
439	/**
440	* Tests the ThresholdCurve generation from the command line.
441	* The classifier is currently hardcoded. Pipe in an arff file.
442	*
443	* @param args currently ignored
444	*/
445	public static void main(String [] args) {
446
447	try {
448
449	Instances inst = new Instances(new java.io.InputStreamReader(System.in));
450	if (false) {
451	System.out.println(ThresholdCurve.getNPointPrecision(inst, 11));
452	} else {
453	inst.setClassIndex(inst.numAttributes() - 1);
454	ThresholdCurve tc = new ThresholdCurve();
455	EvaluationUtils eu = new EvaluationUtils();
456	Classifier classifier = new weka.classifiers.functions.Logistic();
457	FastVector predictions = new FastVector();
458	for (int i = 0; i < 2; i++) { // Do two runs.
459	eu.setSeed(i);
460	predictions.appendElements(eu.getCVPredictions(classifier, inst, 10));
461	//System.out.println("\n\n\n");
462	}
463	Instances result = tc.getCurve(predictions);
464	System.out.println(result);
465	}
466	} catch (Exception ex) {
467	ex.printStackTrace();
468	}
469	}
470	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: