Context Navigation

source: src/main/java/weka/estimators/UnivariateKernelEstimator.java @ 17

Last change on this file since 17 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 14.7 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* UnivariateKernelEstimator.java
19	* Copyright (C) 2009 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.estimators;
24
25	import java.util.Random;
26	import java.util.Collection;
27	import java.util.Set;
28	import java.util.Map;
29	import java.util.Iterator;
30	import java.util.TreeMap;
31	import java.util.ArrayList;
32
33	import weka.core.Statistics;
34	import weka.core.Utils;
35
36	/**
37	* Simple weighted kernel density estimator.
38	*
39	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
40	* @version $Revision: 5680 $
41	*/
42	public class UnivariateKernelEstimator implements UnivariateDensityEstimator,
43	UnivariateIntervalEstimator {
44
45	/** The collection used to store the weighted values. */
46	protected TreeMap<Double, Double> m_TM = new TreeMap<Double, Double>();
47
48	/** The weighted sum of values */
49	protected double m_WeightedSum = 0;
50
51	/** The weighted sum of squared values */
52	protected double m_WeightedSumSquared = 0;
53
54	/** The weight of the values collected so far */
55	protected double m_SumOfWeights = 0;
56
57	/** The current bandwidth (only computed when needed) */
58	protected double m_Width = Double.MAX_VALUE;
59
60	/** The exponent to use in computation of bandwidth (default: -0.25) */
61	protected double m_Exponent = -0.25;
62
63	/** The minimum allowed value of the kernel width (default: 1.0E-6) */
64	protected double m_MinWidth = 1.0E-6;
65
66	/** Constant for Gaussian density. */
67	public static final double CONST = - 0.5 * Math.log(2 * Math.PI);
68
69	/** Threshold at which further kernels are no longer added to sum. */
70	protected double m_Threshold = 1.0E-6;
71
72	/** The number of intervals used to approximate prediction interval. */
73	protected int m_NumIntervals = 1000;
74
75	/**
76	* Adds a value to the density estimator.
77	*
78	* @param value the value to add
79	* @param weight the weight of the value
80	*/
81	public void addValue(double value, double weight) {
82
83	m_WeightedSum += value * weight;
84	m_WeightedSumSquared += value * value * weight;
85	m_SumOfWeights += weight;
86	if (m_TM.get(value) == null) {
87	m_TM.put(value, weight);
88	} else {
89	m_TM.put(value, m_TM.get(value) + weight);
90	}
91	}
92
93	/**
94	* Updates bandwidth: the sample standard deviation is multiplied by
95	* the total weight to the power of the given exponent.
96	*
97	* If the total weight is not greater than zero, the width is set to
98	* Double.MAX_VALUE. If that is not the case, but the width becomes
99	* smaller than m_MinWidth, the width is set to the value of
100	* m_MinWidth.
101	*/
102	public void updateWidth() {
103
104	// OK, need to do some work
105	if (m_SumOfWeights > 0) {
106
107	// Compute variance for scaling
108	double mean = m_WeightedSum / m_SumOfWeights;
109	double variance = m_WeightedSumSquared / m_SumOfWeights - mean * mean;
110	if (variance < 0) {
111	variance = 0;
112	}
113
114	// Compute kernel bandwidth
115	m_Width = Math.sqrt(variance) * Math.pow(m_SumOfWeights, m_Exponent);
116
117	if (m_Width <= m_MinWidth) {
118	m_Width = m_MinWidth;
119	}
120	} else {
121	m_Width = Double.MAX_VALUE;
122	}
123	}
124
125
126	/**
127	* Returns the interval for the given confidence value.
128	*
129	* @param conf the confidence value in the interval [0, 1]
130	* @return the interval
131	*/
132	public double[][] predictIntervals(double conf) {
133
134	// Update the bandwidth
135	updateWidth();
136
137	// Compute minimum and maximum value, and delta
138	double val = Statistics.normalInverse(1.0 - (1.0 - conf) / 2);
139	double min = m_TM.firstKey() - val * m_Width;
140	double max = m_TM.lastKey() + val * m_Width;
141	double delta = (max - min) / m_NumIntervals;
142
143	// Create array with estimated probabilities
144	double[] probabilities = new double[m_NumIntervals];
145	double leftVal = Math.exp(logDensity(min));
146	for (int i = 0; i < m_NumIntervals; i++) {
147	double rightVal = Math.exp(logDensity(min + (i + 1) * delta));
148	probabilities[i] = 0.5 * (leftVal + rightVal) * delta;
149	leftVal = rightVal;
150	}
151
152	// Sort array based on area of bin estimates
153	int[] sortedIndices = Utils.sort(probabilities);
154
155	// Mark the intervals to use
156	double sum = 0;
157	boolean[] toUse = new boolean[probabilities.length];
158	int k = 0;
159	while ((sum < conf) && (k < toUse.length)){
160	toUse[sortedIndices[toUse.length - (k + 1)]] = true;
161	sum += probabilities[sortedIndices[toUse.length - (k + 1)]];
162	k++;
163	}
164
165	// Don't need probabilities anymore
166	probabilities = null;
167
168	// Create final list of intervals
169	ArrayList<double[]> intervals = new ArrayList<double[]>();
170
171	// The current interval
172	double[] interval = null;
173
174	// Iterate through kernels
175	boolean haveStartedInterval = false;
176	for (int i = 0; i < m_NumIntervals; i++) {
177
178	// Should the current bin be used?
179	if (toUse[i]) {
180
181	// Do we need to create a new interval?
182	if (haveStartedInterval == false) {
183	haveStartedInterval = true;
184	interval = new double[2];
185	interval[0] = min + i * delta;
186	}
187
188	// Regardless, we should update the upper boundary
189	interval[1] = min + (i + 1) * delta;
190	} else {
191
192	// We need to finalize and store the last interval
193	// if necessary.
194	if (haveStartedInterval) {
195	haveStartedInterval = false;
196	intervals.add(interval);
197	}
198	}
199	}
200
201	// Add last interval if there is one
202	if (haveStartedInterval) {
203	intervals.add(interval);
204	}
205
206	return intervals.toArray(new double[0][0]);
207	}
208
209	/**
210	* Computes the logarithm of x and y given the logarithms of x and y.
211	*
212	* This is based on Tobias P. Mann's description in "Numerically
213	* Stable Hidden Markov Implementation" (2006).
214	*/
215	protected double logOfSum(double logOfX, double logOfY) {
216
217	// Check for cases where log of zero is present
218	if (Double.isNaN(logOfX)) {
219	return logOfY;
220	}
221	if (Double.isNaN(logOfY)) {
222	return logOfX;
223	}
224
225	// Otherwise return proper result, taken care of overflows
226	if (logOfX > logOfY) {
227	return logOfX + Math.log(1 + Math.exp(logOfY - logOfX));
228	} else {
229	return logOfY + Math.log(1 + Math.exp(logOfX - logOfY));
230	}
231	}
232
233	/**
234	* Compute running sum of density values and weights.
235	*/
236	protected void runningSum(Set<Map.Entry<Double,Double>> c, double value,
237	double[] sums) {
238
239	// Auxiliary variables
240	double offset = CONST - Math.log(m_Width);
241	double logFactor = Math.log(m_Threshold) - Math.log(1 - m_Threshold);
242	double logSumOfWeights = Math.log(m_SumOfWeights);
243
244	// Iterate through values
245	Iterator<Map.Entry<Double,Double>> itr = c.iterator();
246	while(itr.hasNext()) {
247	Map.Entry<Double,Double> entry = itr.next();
248
249	// Skip entry if weight is zero because it cannot contribute to sum
250	if (entry.getValue() > 0) {
251	double diff = (entry.getKey() - value) / m_Width;
252	double logDensity = offset - 0.5 * diff * diff;
253	double logWeight = Math.log(entry.getValue());
254	sums[0] = logOfSum(sums[0], logWeight + logDensity);
255	sums[1] = logOfSum(sums[1], logWeight);
256
257	// Can we stop assuming worst case?
258	if (logDensity + logSumOfWeights < logOfSum(logFactor + sums[0], logDensity + sums[1])) {
259	break;
260	}
261	}
262	}
263	}
264
265	/**
266	* Returns the natural logarithm of the density estimate at the given
267	* point.
268	*
269	* @param value the value at which to evaluate
270	* @return the natural logarithm of the density estimate at the given
271	* value
272	*/
273	public double logDensity(double value) {
274
275	// Update the bandwidth
276	updateWidth();
277
278	// Array used to keep running sums
279	double[] sums = new double[2];
280	sums[0] = Double.NaN;
281	sums[1] = Double.NaN;
282
283	// Examine right-hand size of value
284	runningSum(m_TM.tailMap(value, true).entrySet(), value, sums);
285
286	// Examine left-hand size of value
287	runningSum(m_TM.headMap(value, false).descendingMap().entrySet(), value, sums);
288
289	// Need to normalize
290	return sums[0] - Math.log(m_SumOfWeights);
291	}
292
293	/**
294	* Returns textual description of this estimator.
295	*/
296	public String toString() {
297
298	return "Kernel estimator with bandwidth " + m_Width +
299	" and total weight " + m_SumOfWeights +
300	" based on\n" + m_TM.toString();
301	}
302
303	/**
304	* Main method, used for testing this class.
305	*/
306	public static void main(String[] args) {
307
308	// Get random number generator initialized by system
309	Random r = new Random();
310
311	// Create density estimator
312	UnivariateKernelEstimator e = new UnivariateKernelEstimator();
313
314	// Output the density estimator
315	System.out.println(e);
316
317	// Monte Carlo integration
318	double sum = 0;
319	for (int i = 0; i < 1000; i++) {
320	sum += Math.exp(e.logDensity(r.nextDouble() * 10.0 - 5.0));
321	}
322	System.out.println("Approximate integral: " + 10.0 * sum / 1000);
323
324	// Add Gaussian values into it
325	for (int i = 0; i < 1000; i++) {
326	e.addValue(0.1 * r.nextGaussian() - 3, 1);
327	e.addValue(r.nextGaussian() * 0.25, 3);
328	}
329
330	// Monte Carlo integration
331	sum = 0;
332	int points = 10000;
333	for (int i = 0; i < points; i++) {
334	double value = r.nextDouble() * 10.0 - 5.0;
335	sum += Math.exp(e.logDensity(value));
336	}
337	System.out.println("Approximate integral: " + 10.0 * sum / points);
338
339	// Check interval estimates
340	double[][] Intervals = e.predictIntervals(0.9);
341
342	System.out.println("Printing kernel intervals ---------------------");
343
344	for (int k = 0; k < Intervals.length; k++) {
345	System.out.println("Left: " + Intervals[k][0] + "\t Right: " + Intervals[k][1]);
346	}
347
348	System.out.println("Finished kernel printing intervals ---------------------");
349
350	double Covered = 0;
351	for (int i = 0; i < 1000; i++) {
352	double val = -1;
353	if (r.nextDouble() < 0.25) {
354	val = 0.1 * r.nextGaussian() - 3.0;
355	} else {
356	val = r.nextGaussian() * 0.25;
357	}
358	for (int k = 0; k < Intervals.length; k++) {
359	if (val >= Intervals[k][0] && val <= Intervals[k][1]) {
360	Covered++;
361	break;
362	}
363	}
364	}
365	System.out.println("Coverage at 0.9 level for kernel intervals: " + Covered / 1000);
366
367	// Compare performance to normal estimator on normally distributed data
368	UnivariateKernelEstimator eKernel = new UnivariateKernelEstimator();
369	UnivariateNormalEstimator eNormal = new UnivariateNormalEstimator();
370
371	for (int j = 1; j < 5; j++) {
372	double numTrain = Math.pow(10, j);
373	System.out.println("Number of training cases: " +
374	numTrain);
375
376	// Add training cases
377	for (int i = 0; i < numTrain; i++) {
378	double val = r.nextGaussian() * 1.5 + 0.5;
379	eKernel.addValue(val, 1);
380	eNormal.addValue(val, 1);
381	}
382
383	// Monte Carlo integration
384	sum = 0;
385	points = 10000;
386	for (int i = 0; i < points; i++) {
387	double value = r.nextDouble() * 20.0 - 10.0;
388	sum += Math.exp(eKernel.logDensity(value));
389	}
390	System.out.println("Approximate integral for kernel estimator: " + 20.0 * sum / points);
391
392	// Evaluate estimators
393	double loglikelihoodKernel = 0, loglikelihoodNormal = 0;
394	for (int i = 0; i < 1000; i++) {
395	double val = r.nextGaussian() * 1.5 + 0.5;
396	loglikelihoodKernel += eKernel.logDensity(val);
397	loglikelihoodNormal += eNormal.logDensity(val);
398	}
399	System.out.println("Loglikelihood for kernel estimator: " +
400	loglikelihoodKernel / 1000);
401	System.out.println("Loglikelihood for normal estimator: " +
402	loglikelihoodNormal / 1000);
403
404	// Check interval estimates
405	double[][] kernelIntervals = eKernel.predictIntervals(0.95);
406	double[][] normalIntervals = eNormal.predictIntervals(0.95);
407
408	System.out.println("Printing kernel intervals ---------------------");
409
410	for (int k = 0; k < kernelIntervals.length; k++) {
411	System.out.println("Left: " + kernelIntervals[k][0] + "\t Right: " + kernelIntervals[k][1]);
412	}
413
414	System.out.println("Finished kernel printing intervals ---------------------");
415
416	System.out.println("Printing normal intervals ---------------------");
417
418	for (int k = 0; k < normalIntervals.length; k++) {
419	System.out.println("Left: " + normalIntervals[k][0] + "\t Right: " + normalIntervals[k][1]);
420	}
421
422	System.out.println("Finished normal printing intervals ---------------------");
423
424	double kernelCovered = 0;
425	double normalCovered = 0;
426	for (int i = 0; i < 1000; i++) {
427	double val = r.nextGaussian() * 1.5 + 0.5;
428	for (int k = 0; k < kernelIntervals.length; k++) {
429	if (val >= kernelIntervals[k][0] && val <= kernelIntervals[k][1]) {
430	kernelCovered++;
431	break;
432	}
433	}
434	for (int k = 0; k < normalIntervals.length; k++) {
435	if (val >= normalIntervals[k][0] && val <= normalIntervals[k][1]) {
436	normalCovered++;
437	break;
438	}
439	}
440	}
441	System.out.println("Coverage at 0.95 level for kernel intervals: " + kernelCovered / 1000);
442	System.out.println("Coverage at 0.95 level for normal intervals: " + normalCovered / 1000);
443
444	kernelIntervals = eKernel.predictIntervals(0.8);
445	normalIntervals = eNormal.predictIntervals(0.8);
446	kernelCovered = 0;
447	normalCovered = 0;
448	for (int i = 0; i < 1000; i++) {
449	double val = r.nextGaussian() * 1.5 + 0.5;
450	for (int k = 0; k < kernelIntervals.length; k++) {
451	if (val >= kernelIntervals[k][0] && val <= kernelIntervals[k][1]) {
452	kernelCovered++;
453	break;
454	}
455	}
456	for (int k = 0; k < normalIntervals.length; k++) {
457	if (val >= normalIntervals[k][0] && val <= normalIntervals[k][1]) {
458	normalCovered++;
459	break;
460	}
461	}
462	}
463	System.out.println("Coverage at 0.8 level for kernel intervals: " + kernelCovered / 1000);
464	System.out.println("Coverage at 0.8 level for normal intervals: " + normalCovered / 1000);
465	}
466	}
467	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: