Context Navigation

source: src/main/java/weka/classifiers/meta/ensembleSelection/ModelBag.java @ 9

Last change on this file since 9 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 22.1 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* EnsembleSelection.java
19	* Copyright (C) 2006 David Michael
20	*
21	*/
22
23	package weka.classifiers.meta.ensembleSelection;
24
25	import weka.classifiers.Evaluation;
26	import weka.core.Instances;
27	import weka.core.RevisionHandler;
28	import weka.core.RevisionUtils;
29
30	import java.util.Random;
31
32	/**
33	* This class is responsible for the duties of a bag of models. It is designed
34	* for use with the EnsembleSelection meta classifier. It handles shuffling the
35	* models, doing sort initialization, performing forward selection/ backwards
36	* elimination, etc.
37	* <p/>
38	* We utilize a simple "virtual indexing" scheme inside. If we shuffle and/or
39	* sort the models, we change the "virtual" order around. The elements of the
40	* bag are always those elements with virtual index 0..(m_bagSize-1). Each
41	* "virtual" index maps to some real index in m_models. Not every model in
42	* m_models gets a virtual index... the virtual indexing is what defines the
43	* subset of models of which our Bag is composed. This makes it easy to refer to
44	* models in the bag, by their virtual index, while maintaining the original
45	* indexing for our clients.
46	*
47	* @author David Michael
48	* @version $Revision: 1.2 $
49	*/
50	public class ModelBag
51	implements RevisionHandler {
52
53	/**
54	* The "models", as a multidimensional array of predictions for the
55	* validation set. The first index is the model index, the second index is
56	* the index of the instance, and the third is the typical "class" index for
57	* a prediction's distribution. This is given to us in the constructor, and
58	* we never change it.
59	*/
60	private double m_models[][][];
61
62	/**
63	* Maps each model in our virtual indexing scheme to its original index as
64	* it is in m_models. The first m_bag_size elements here are considered our
65	* bag. Throughout the code, we use the index in to this array to refer to a
66	* model. When we shuffle the models, we really simply shuffle this array.
67	* When we want to refer back to the original model, it is easily looked up
68	* in this array. That is, if j = m_model_index[i], then m_models[j] is the
69	* model referred to by "virtual index" i. Models can easily be accessed by
70	* their virtual index using the "model()" method.
71	*/
72	private int m_modelIndex[];
73
74	/**
75	* The number of models in our bag. 1 <= m_bag_size <= m_models.length
76	*/
77	private int m_bagSize;
78
79	/**
80	* The total number of models chosen thus far for this bag. This value is
81	* important when calculating the predictions for the bag. (See
82	* computePredictions).
83	*/
84	private int m_numChosen;
85
86	/**
87	* The number of times each model has been chosen. Also can be thought of as
88	* the weight for each model. Indexed by the "virtual index".
89	*/
90	private int m_timesChosen[];
91
92	/**
93	* If true, print out debug information.
94	*/
95	private boolean m_debug;
96
97	/**
98	* Double representing the best performance achieved thus far in this bag.
99	* This Must be updated each time we make a change to the bag that improves
100	* performance. This is so that after all hillclimbing is completed, we can
101	* go back to the best ensemble that we encountered during hillclimbing.
102	*/
103	private double m_bestPerformance;
104
105	/**
106	* Array representing the weights for all the models which achieved the best
107	* performance thus far for the bag (i.e., the weights that achieved
108	* m_bestPerformance. This Must be updated each time we make a change to the
109	* bag (that improves performance, by calling updateBestTimesChosen. This is
110	* so that after all hillclimbing is completed, we can go back to the best
111	* ensemble that we encountered during hillclimbing. This array, unlike
112	* m_timesChosen, uses the original indexing as taken from m_models. That
113	* way, any time getModelWeights is called (which returns this array), the
114	* array is in the correct format for our client.
115	*/
116	private int m_bestTimesChosen[];
117
118	/**
119	* Constructor for ModelBag.
120	*
121	* @param models
122	* The complete set of models from which to draw our bag. First
123	* index is for the model, second is for the instance. The last
124	* is a prediction distribution for that instance. Models are
125	* represented by this array of predictions for validation data,
126	* since that's all ensemble selection needs to know.
127	* @param bag_percent
128	* The percentage of the set of given models that should be used
129	* in the Model Bag.
130	* @param debug
131	* Whether the ModelBag should print debug information.
132	*
133	*/
134	public ModelBag(double models[][][], double bag_percent, boolean debug) {
135	m_debug = debug;
136	if (models.length == 0) {
137	throw new IllegalArgumentException(
138	"ModelBag needs at least 1 model.");
139	}
140	m_bagSize = (int) ((double) models.length * bag_percent);
141	m_models = models;
142	m_modelIndex = new int[m_models.length];
143	m_timesChosen = new int[m_models.length];
144	m_bestTimesChosen = m_timesChosen;
145	m_bestPerformance = 0.0;
146
147	// Initially, no models are chosen.
148	m_numChosen = 0;
149	// Prepare our virtual indexing scheme. Initially, the indexes are
150	// the same as the original.
151	for (int i = 0; i < m_models.length; ++i) {
152	m_modelIndex[i] = i;
153	m_timesChosen[i] = 0;
154	}
155	}
156
157	/**
158	* Swap model at virtual index i with model at virtual index j. This is used
159	* to shuffle the models. We do not change m_models, only the arrays which
160	* use the virtual indexing; m_modelIndex and m_timesChosen.
161	*
162	* @param i first index
163	* @param j second index
164	*/
165	private void swap(int i, int j) {
166	if (i != j) {
167	int temp_index = m_modelIndex[i];
168	m_modelIndex[i] = m_modelIndex[j];
169	m_modelIndex[j] = temp_index;
170
171	int tempWeight = m_timesChosen[i];
172	m_timesChosen[i] = m_timesChosen[j];
173	m_timesChosen[j] = tempWeight;
174	}
175	}
176
177	/**
178	* Shuffle the models. The order in m_models is preserved, but we change our
179	* virtual indexes around.
180	*
181	* @param rand the random number generator to use
182	*/
183	public void shuffle(Random rand) {
184	if (m_models.length < 2)
185	return;
186
187	for (int i = 0; i < m_models.length; ++i) {
188	int swap_index = rand.nextInt(m_models.length - 1);
189	if (swap_index >= i)
190	++swap_index; // don't swap with itself
191	swap(i, swap_index);
192	}
193	}
194
195	/**
196	* Convert an array of weights using virtual indices to an array of weights
197	* using real indices.
198	*
199	* @param virtual_weights the virtual indices
200	* @return the real indices
201	*/
202	private int[] virtualToRealWeights(int virtual_weights[]) {
203	int real_weights[] = new int[virtual_weights.length];
204	for (int i = 0; i < real_weights.length; ++i) {
205	real_weights[m_modelIndex[i]] = virtual_weights[i];
206	}
207	return real_weights;
208	}
209
210	/**
211	*
212	*/
213	private void updateBestTimesChosen() {
214	m_bestTimesChosen = virtualToRealWeights(m_timesChosen);
215	}
216
217	/**
218	* Sort initialize the bag.
219	*
220	* @param num
221	* the Maximum number of models to initialize with
222	* @param greedy
223	* True if we do greedy addition, up to num. Greedy sort
224	* initialization adds models (up to num) in order of best to
225	* worst performance until performance no longer improves.
226	* @param instances
227	* the data set (needed for performance evaluation)
228	* @param metric
229	* metric for which to optimize. See EnsembleMetricHelper
230	* @return returns an array of indexes which were selected, in order
231	* starting from the model with best performance.
232	* @throws Exception if something goes wrong
233	*/
234	public int[] sortInitialize(int num, boolean greedy, Instances instances,
235	int metric) throws Exception {
236
237	// First, get the performance of each model
238	double performance[] = new double[m_bagSize];
239	for (int i = 0; i < m_bagSize; ++i) {
240	performance[i] = evaluatePredictions(instances, model(i), metric);
241	}
242	int bestModels[] = new int[num]; // we'll use this to save model info
243	// Now sort the models by their performance... note we only need the
244	// first "num",
245	// so we don't actually bother to sort the whole thing... instead, we
246	// pick the num best
247	// by running num iterations of selection sort.
248	for (int i = 0; i < num; ++i) {
249	int max_index = i;
250	double max_value = performance[i];
251	for (int j = i + 1; j < m_bagSize; ++j) {
252	// Find the best model which we haven't already selected
253	if (performance[j] > max_value) {
254	max_value = performance[j];
255	max_index = j;
256	}
257	}
258	// Swap ith model in to the ith position (selection sort)
259	this.swap(i, max_index);
260	// swap performance numbers, too
261	double temp_perf = performance[i];
262	performance[i] = performance[max_index];
263	performance[max_index] = temp_perf;
264
265	bestModels[i] = m_modelIndex[i];
266	if (!greedy) {
267	// If we're not being greedy, we just throw the model in
268	// no matter what
269	++m_timesChosen[i];
270	++m_numChosen;
271	}
272	}
273	// Now the best "num" models are all sorted and in position.
274	if (greedy) {
275	// If the "greedy" option was specified, do a smart sort
276	// initialization
277	// that adds models only so long as they help overall performance.
278	// This is what was done in the original Caruana paper.
279	double[][] tempPredictions = null;
280	double bestPerformance = 0.0;
281	if (num > 0) {
282	++m_timesChosen[0];
283	++m_numChosen;
284	updateBestTimesChosen();
285	}
286	for (int i = 1; i < num; ++i) {
287	tempPredictions = computePredictions(i, true);
288	double metric_value = evaluatePredictions(instances,
289	tempPredictions, metric);
290	if (metric_value > bestPerformance) {
291	// If performance improved, update the appropriate info.
292	bestPerformance = metric_value;
293	++m_timesChosen[i];
294	++m_numChosen;
295	updateBestTimesChosen();
296	} else {
297	// We found a model that doesn't help performance, so we
298	// stop adding models.
299	break;
300	}
301	}
302	}
303	updateBestTimesChosen();
304	if (m_debug) {
305	System.out.println("Sort Initialization added best " + m_numChosen
306	+ " models to the bag.");
307	}
308	return bestModels;
309	}
310
311	/**
312	* Add "weight" to the number of times each model in the bag was chosen.
313	* Typically for use with backward elimination.
314	*
315	* @param weight the weight to add
316	*/
317	public void weightAll(int weight) {
318	for (int i = 0; i < m_bagSize; ++i) {
319	m_timesChosen[i] += weight;
320	m_numChosen += weight;
321	}
322	updateBestTimesChosen();
323	}
324
325	/**
326	* Forward select one model. Will add the model which has the best effect on
327	* performance. If replacement is false, and all models are chosen, no
328	* action is taken. If a model can be added, one always is (even if it hurts
329	* performance).
330	*
331	* @param withReplacement
332	* whether a model can be added more than once.
333	* @param instances
334	* The dataset, for calculating performance.
335	* @param metric
336	* The metric to which we will optimize. See EnsembleMetricHelper
337	* @throws Exception if something goes wrong
338	*/
339	public void forwardSelect(boolean withReplacement, Instances instances,
340	int metric) throws Exception {
341
342	double bestPerformance = -1.0;
343	int bestIndex = -1;
344	double tempPredictions[][];
345	for (int i = 0; i < m_bagSize; ++i) {
346	// For each model in the bag
347	if ((m_timesChosen[i] == 0) \|\| withReplacement) {
348	// If the model has not been chosen, or we're allowing
349	// replacement
350	// Get the predictions we would have if we add this model to the
351	// ensemble
352	tempPredictions = computePredictions(i, true);
353	// And find out how the hypothetical ensemble would perform.
354	double metric_value = evaluatePredictions(instances,
355	tempPredictions, metric);
356	if (metric_value > bestPerformance) {
357	// If it's better than our current best, make it our NEW
358	// best.
359	bestIndex = i;
360	bestPerformance = metric_value;
361	}
362	}
363	}
364	if (bestIndex == -1) {
365	// Replacement must be false, with more hillclimb iterations than
366	// models. Do nothing and return.
367	if (m_debug) {
368	System.out.println("Couldn't add model. No action performed.");
369	}
370	return;
371	}
372	// We picked bestIndex as our best model. Update appropriate info.
373	m_timesChosen[bestIndex]++;
374	m_numChosen++;
375	if (bestPerformance > m_bestPerformance) {
376	// We find the peak of our performance over all hillclimb
377	// iterations.
378	// If this forwardSelect step improved our overall performance,
379	// update
380	// our best ensemble info.
381	updateBestTimesChosen();
382	m_bestPerformance = bestPerformance;
383	}
384	}
385
386	/**
387	* Find the model whose removal will help the ensemble's performance the
388	* most, and remove it. If there is only one model left, we leave it in. If
389	* we can remove a model, we always do, even if it hurts performance.
390	*
391	* @param instances
392	* The data set, for calculating performance
393	* @param metric
394	* Metric to optimize for. See EnsembleMetricHelper.
395	* @throws Exception if something goes wrong
396	*/
397	public void backwardEliminate(Instances instances, int metric)
398	throws Exception {
399
400	// Find the best model to remove. I.e., model for which removal improves
401	// performance the most (or hurts it least), and remove it.
402	if (m_numChosen <= 1) {
403	// If we only have one model left, keep it, as a bag
404	// which chooses no models doesn't make much sense.
405	return;
406	}
407	double bestPerformance = -1.0;
408	int bestIndex = -1;
409	double tempPredictions[][];
410	for (int i = 0; i < m_bagSize; ++i) {
411	// For each model in the bag
412	if (m_timesChosen[i] > 0) {
413	// If the model has been chosen at least once,
414	// Get the predictions we would have if we remove this model
415	tempPredictions = computePredictions(i, false);
416	// And find out how the hypothetical ensemble would perform.
417	double metric_value = evaluatePredictions(instances,
418	tempPredictions, metric);
419	if (metric_value > bestPerformance) {
420	// If it's better than our current best, make it our NEW
421	// best.
422	bestIndex = i;
423	bestPerformance = metric_value;
424	}
425	}
426	}
427	if (bestIndex == -1) {
428	// The most likely cause of this is that we didn't have any models
429	// we could
430	// remove. Do nothing & return.
431	if (m_debug) {
432	System.out
433	.println("Couldn't remove model. No action performed.");
434	}
435	return;
436	}
437	// We picked bestIndex as our best model. Update appropriate info.
438	m_timesChosen[bestIndex]--;
439	m_numChosen--;
440	if (m_debug) {
441	System.out.println("Removing model " + m_modelIndex[bestIndex]
442	+ " (" + bestIndex + ") " + bestPerformance);
443	}
444	if (bestPerformance > m_bestPerformance) {
445	// We find the peak of our performance over all hillclimb
446	// iterations.
447	// If this forwardSelect step improved our overall performance,
448	// update
449	// our best ensemble info.
450	updateBestTimesChosen();
451	m_bestPerformance = bestPerformance;
452	}
453	// return m_model_index[best_index]; //translate to original indexing
454	// and return
455	}
456
457	/**
458	* Find the best action to perform, be it adding a model or removing a
459	* model, and perform it. Some action is always performed, even if it hurts
460	* performance.
461	*
462	* @param with_replacement
463	* whether we can add a model more than once
464	* @param instances
465	* The dataset, for determining performance.
466	* @param metric
467	* The metric for which to optimize. See EnsembleMetricHelper.
468	* @throws Exception if something goes wrong
469	*/
470	public void forwardSelectOrBackwardEliminate(boolean with_replacement,
471	Instances instances, int metric) throws Exception {
472
473	// Find the best action to perform, be it adding a model or removing a
474	// model,
475	// and do it.
476	double bestPerformance = -1.0;
477	int bestIndex = -1;
478	boolean added = true;
479	double tempPredictions[][];
480	for (int i = 0; i < m_bagSize; ++i) {
481	// For each model in the bag:
482	// Try removing the model
483	if (m_timesChosen[i] > 0) {
484	// If the model has been chosen at least once,
485	// Get the predictions we would have if we remove this model
486	tempPredictions = computePredictions(i, false);
487	// And find out how the hypothetical ensemble would perform.
488	double metric_value = evaluatePredictions(instances,
489	tempPredictions, metric);
490	if (metric_value > bestPerformance) {
491	// If it's better than our current best, make it our NEW
492	// best.
493	bestIndex = i;
494	bestPerformance = metric_value;
495	added = false;
496	}
497	}
498	if ((m_timesChosen[i] == 0) \|\| with_replacement) {
499	// If the model hasn't been chosen, or if we can choose it more
500	// than once, try adding it:
501	// Get the predictions we would have if we added the model
502	tempPredictions = computePredictions(i, true);
503	// And find out how the hypothetical ensemble would perform.
504	double metric_value = evaluatePredictions(instances,
505	tempPredictions, metric);
506	if (metric_value > bestPerformance) {
507	// If it's better than our current best, make it our NEW
508	// best.
509	bestIndex = i;
510	bestPerformance = metric_value;
511	added = true;
512	}
513	}
514	}
515	if (bestIndex == -1) {
516	// Shouldn't really happen. Possible (I think) if the model bag is
517	// empty. Just return.
518	if (m_debug) {
519	System.out.println("Couldn't add or remove model. No action performed.");
520	}
521	return;
522	}
523	// Now we've found the best change to make:
524	// * bestIndex is the (virtual) index of the model we should change
525	// * added is true if the model should be added (false if should be
526	// removed)
527	int changeInWeight = added ? 1 : -1;
528	m_timesChosen[bestIndex] += changeInWeight;
529	m_numChosen += changeInWeight;
530	if (bestPerformance > m_bestPerformance) {
531	// We find the peak of our performance over all hillclimb
532	// iterations.
533	// If this forwardSelect step improved our overall performance,
534	// update
535	// our best ensemble info.
536	updateBestTimesChosen();
537	m_bestPerformance = bestPerformance;
538	}
539	}
540
541	/**
542	* returns the model weights
543	*
544	* @return the model weights
545	*/
546	public int[] getModelWeights() {
547	return m_bestTimesChosen;
548	}
549
550	/**
551	* Returns the "model" at the given virtual index. Here, by "model" we mean
552	* its predictions with respect to the validation set. This is just a
553	* convenience method, since we use the "virtual" index more than the real
554	* one inside this class.
555	*
556	* @param index
557	* the "virtual" index - the one for internal use
558	* @return the predictions for the model for all validation instances.
559	*/
560	private double[][] model(int index) {
561	return m_models[m_modelIndex[index]];
562	}
563
564	/**
565	* Compute predictions based on the current model, adding (or removing) the
566	* model at the given (internal) index.
567	*
568	* @param index_to_change
569	* index of model we're adding or removing
570	* @param add
571	* whether we add it. If false, we remove it.
572	* @return the predictions for all validation instances
573	*/
574	private double[][] computePredictions(int index_to_change, boolean add) {
575	double[][] predictions = new double[m_models[0].length][m_models[0][0].length];
576	for (int i = 0; i < m_bagSize; ++i) {
577	if (m_timesChosen[i] > 0) {
578	for (int j = 0; j < m_models[0].length; ++j) {
579	for (int k = 0; k < m_models[0][j].length; ++k) {
580	predictions[j][k] += model(i)[j][k] * m_timesChosen[i];
581	}
582	}
583	}
584	}
585	for (int j = 0; j < m_models[0].length; ++j) {
586	int change = add ? 1 : -1;
587	for (int k = 0; k < m_models[0][j].length; ++k) {
588	predictions[j][k] += change * model(index_to_change)[j][k];
589	predictions[j][k] /= (m_numChosen + change);
590	}
591	}
592	return predictions;
593	}
594
595	/**
596	* Return the performance of the given predictions on the given instances
597	* with respect to the given metric (see EnsembleMetricHelper).
598	*
599	* @param instances
600	* the validation data
601	* @param temp_predictions
602	* the predictions to evaluate
603	* @param metric
604	* the metric for which to optimize (see EnsembleMetricHelper)
605	* @return the performance
606	* @throws Exception if something goes wrong
607	*/
608	private double evaluatePredictions(Instances instances,
609	double[][] temp_predictions, int metric) throws Exception {
610
611	Evaluation eval = new Evaluation(instances);
612	for (int i = 0; i < instances.numInstances(); ++i) {
613	eval.evaluateModelOnceAndRecordPrediction(temp_predictions[i],
614	instances.instance(i));
615	}
616	return EnsembleMetricHelper.getMetric(eval, metric);
617	}
618
619	/**
620	* Gets the individual performances of all the models in the bag.
621	*
622	* @param instances
623	* The validation data, for which we want performance.
624	* @param metric
625	* The desired metric (see EnsembleMetricHelper).
626	* @return the performance
627	* @throws Exception if something goes wrong
628	*/
629	public double[] getIndividualPerformance(Instances instances, int metric)
630	throws Exception {
631
632	double[] performance = new double[m_bagSize];
633	for (int i = 0; i < m_bagSize; ++i) {
634	performance[i] = evaluatePredictions(instances, model(i), metric);
635	}
636	return performance;
637	}
638
639	/**
640	* Returns the revision string.
641	*
642	* @return the revision
643	*/
644	public String getRevision() {
645	return RevisionUtils.extract("$Revision: 1.2 $");
646	}
647	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: