source: src/main/java/weka/classifiers/meta/EnsembleSelection.java @ 12

Last change on this file since 12 was 4, checked in by gnappo, 14 years ago

Weka import.

File size: 56.7 KB
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    EnsembleSelection.java
 *    Copyright (C) 2006 David Michael
 *
 */

package weka.classifiers.meta;

import weka.classifiers.Evaluation;
import weka.classifiers.RandomizableClassifier;
import weka.classifiers.meta.ensembleSelection.EnsembleMetricHelper;
import weka.classifiers.meta.ensembleSelection.EnsembleSelectionLibrary;
import weka.classifiers.meta.ensembleSelection.EnsembleSelectionLibraryModel;
import weka.classifiers.meta.ensembleSelection.ModelBag;
import weka.classifiers.trees.REPTree;
import weka.classifiers.xml.XMLClassifier;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.xml.KOML;
import weka.core.xml.XMLOptions;
import weka.core.xml.XMLSerialization;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 <!-- globalinfo-start -->
 * Combines several classifiers using the ensemble selection method. For more information, see: Caruana, Rich, Niculescu, Alex, Crew, Geoff, and Ksikes, Alex, Ensemble Selection from Libraries of Models, The International Conference on Machine Learning (ICML'04), 2004.  Implemented in Weka by Bob Jung and David Michael.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{RichCaruana2004,
 *    author = {Rich Caruana, Alex Niculescu, Geoff Crew, and Alex Ksikes},
 *    booktitle = {21st International Conference on Machine Learning},
 *    title = {Ensemble Selection from Libraries of Models},
 *    year = {2004}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 * Our implementation of ensemble selection is a bit different from the other
 * classifiers because we assume that the list of models to be trained is too
 * large to fit in memory and that our base classifiers will need to be
 * serialized to the file system (in the directory given by the
 * "workingDirectory" option).  We have adopted the term "model library" for
 * this large set of classifiers, in keeping with the original paper.
 * <p/>
 *
 * If you are planning to use this classifier, we highly recommend you take a
 * quick look at our FAQ/tutorial on the WIKI.  There are a few things that
 * are unique to this classifier that could trip you up.  Otherwise, this
 * method is a great way to get really great classifier performance without
 * having to do too much parameter tuning.  What is nice is that in the worst
 * case you get a nice summary of how a large number of diverse models
 * performed on your data set.
 * <p/>
 *
 * This class relies on the package weka.classifiers.meta.ensembleSelection.
 * <p/>
 *
 * When run from the Explorer or another GUI, the classifier depends on the
 * package weka.gui.libraryEditor.
 * <p/>
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -L &lt;/path/to/modelLibrary&gt;
 *  Specifies the Model Library File, containing the list of all models.</pre>
 *
 * <pre> -W &lt;/path/to/working/directory&gt;
 *  Specifies the Working Directory, where all models will be stored.</pre>
 *
 * <pre> -B &lt;numModelBags&gt;
 *  Set the number of bags, i.e., number of iterations to run
 *  the ensemble selection algorithm.</pre>
 *
 * <pre> -E &lt;modelRatio&gt;
 *  Set the ratio of library models that will be randomly chosen
 *  to populate each bag of models.</pre>
 *
 * <pre> -V &lt;validationRatio&gt;
 *  Set the ratio of the training data set that will be reserved
 *  for validation.</pre>
 *
 * <pre> -H &lt;hillClimbIterations&gt;
 *  Set the number of hillclimbing iterations to be performed
 *  on each model bag.</pre>
 *
 * <pre> -I &lt;sortInitialization&gt;
 *  Set the ratio of the ensemble library that the sort
 *  initialization algorithm will be able to choose from while
 *  initializing the ensemble for each model bag</pre>
 *
 * <pre> -X &lt;numFolds&gt;
 *  Sets the number of cross-validation folds.</pre>
 *
 * <pre> -P &lt;hillclimbMetric&gt;
 *  Specify the metric that will be used for model selection
 *  during the hillclimbing algorithm.
 *  Valid metrics are:
 *   accuracy, rmse, roc, precision, recall, fscore, all</pre>
 *
 * <pre> -A &lt;algorithm&gt;
 *  Specifies the algorithm to be used for ensemble selection.
 *  Valid algorithms are:
 *   "forward" (default) for forward selection.
 *   "backward" for backward elimination.
 *   "both" for both forward and backward elimination.
 *   "best" to simply print out top performer from the
 *      ensemble library
 *   "library" to only train the models in the ensemble
 *      library</pre>
 *
 * <pre> -R
 *  Flag whether or not models can be selected more than once
 *  for an ensemble.</pre>
 *
 * <pre> -G
 *  Whether sort initialization greedily stops adding models
 *  when performance degrades.</pre>
 *
 * <pre> -O
 *  Flag for verbose output. Prints out performance of all
 *  selected models.</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)</pre>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 <!-- options-end -->
 *
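 * A minimal usage sketch (the paths, option values, and data sets below are
 * illustrative assumptions, not requirements of this class):
 * <pre>
 * EnsembleSelection es = new EnsembleSelection();
 * es.setOptions(weka.core.Utils.splitOptions(
 *     "-W /tmp/es-work -B 10 -E 0.5 -V 0.25 -H 100 -P rmse -A forward"));
 * es.buildClassifier(train);                        // train: a weka.core.Instances object
 * double[] dist = es.distributionForInstance(inst); // inst: a weka.core.Instance
 * </pre>
 * <p/>
 *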
 * @author Robert Jung
 * @author David Michael
 * @version $Revision: 5480 $
 */
public class EnsembleSelection 
  extends RandomizableClassifier
  implements TechnicalInformationHandler {

  /** for serialization */
  private static final long serialVersionUID = -1744155148765058511L;

  /**
   * The Library of models, from which we can select our ensemble. Usually
   * loaded from a model list file (.mlf or .model.xml) using the -L
   * command-line option.
   */
  protected EnsembleSelectionLibrary m_library = new EnsembleSelectionLibrary();

  /**
   * List of models chosen by EnsembleSelection. Populated by buildClassifier.
   */
  protected EnsembleSelectionLibraryModel[] m_chosen_models = null;

  /**
   * An array of weights for the chosen models. Elements are parallel to those
   * in m_chosen_models. That is, m_chosen_model_weights[i] is the weight
   * associated with the model at m_chosen_models[i].
   */
  protected int[] m_chosen_model_weights = null;

  /** Total weight of all chosen models. */
  protected int m_total_weight = 0;

  /**
   * ratio of library models that will be randomly chosen to be used for each
   * model bag
   */
  protected double m_modelRatio = 0.5;

  /**
   * Indicates the fraction of the given training set that should be used for
   * hillclimbing/validation. This fraction is set aside and not used for
   * training. It is assumed that any loaded models were also not trained on
   * set-aside data. (If the same percentage and random seed were used
   * previously to train the models in the library, this will work as expected -
   * i.e., those models will be valid)
   */
  protected double m_validationRatio = 0.25;

  /** defines metrics that can be chosen for hillclimbing */
  public static final Tag[] TAGS_METRIC = {
    new Tag(EnsembleMetricHelper.METRIC_ACCURACY, "Optimize with Accuracy"),
    new Tag(EnsembleMetricHelper.METRIC_RMSE, "Optimize with RMSE"),
    new Tag(EnsembleMetricHelper.METRIC_ROC, "Optimize with ROC"),
    new Tag(EnsembleMetricHelper.METRIC_PRECISION, "Optimize with precision"),
    new Tag(EnsembleMetricHelper.METRIC_RECALL, "Optimize with recall"),
    new Tag(EnsembleMetricHelper.METRIC_FSCORE, "Optimize with fscore"),
    new Tag(EnsembleMetricHelper.METRIC_ALL, "Optimize with all metrics"), };

  /**
   * The "enumeration" of the algorithms we can use: forward selection,
   * backward elimination, both combined, best single model, or building
   * the model library only.
   */
  public static final int ALGORITHM_FORWARD = 0;

  public static final int ALGORITHM_BACKWARD = 1;

  public static final int ALGORITHM_FORWARD_BACKWARD = 2;

  public static final int ALGORITHM_BEST = 3;

  public static final int ALGORITHM_BUILD_LIBRARY = 4;

  /** defines the algorithms that can be chosen for ensemble selection */
  public static final Tag[] TAGS_ALGORITHM = {
    new Tag(ALGORITHM_FORWARD, "Forward selection"),
    new Tag(ALGORITHM_BACKWARD, "Backward elimination"),
    new Tag(ALGORITHM_FORWARD_BACKWARD, "Forward Selection + Backward Elimination"),
    new Tag(ALGORITHM_BEST, "Best model"),
    new Tag(ALGORITHM_BUILD_LIBRARY, "Build Library Only") };

  /**
   * this specifies the number of "Ensemble-X" directories that are allowed to
   * be created in the user's home directory, where X is the number of the
   * ensemble
   */
  private static final int MAX_DEFAULT_DIRECTORIES = 1000;

  /**
   * The name of the Model Library File (if one is specified) which lists
   * models from which ensemble selection will choose. This is only used when
   * run from the command-line, as otherwise m_library is responsible for
   * this.
   */
  protected String m_modelLibraryFileName = null;

  /**
   * The number of "model bags". Using 1 is equivalent to no bagging at all.
   */
  protected int m_numModelBags = 10;

  /** The metric for which the ensemble will be optimized. */
  protected int m_hillclimbMetric = EnsembleMetricHelper.METRIC_RMSE;

  /** The algorithm used for ensemble selection. */
  protected int m_algorithm = ALGORITHM_FORWARD;

  /**
   * number of hillclimbing iterations for the ensemble selection algorithm
   */
  protected int m_hillclimbIterations = 100;

  /** ratio of library models to be used for sort initialization */
  protected double m_sortInitializationRatio = 1.0;

  /**
   * specifies whether or not the ensemble algorithm is allowed to include a
   * specific model in the library more than once in each ensemble
   */
  protected boolean m_replacement = true;

  /**
   * specifies whether we use "greedy" sort initialization. If false, we
   * simply add the best m_sortInitializationRatio models of the bag blindly.
   * If true, we add the best models in order up to m_sortInitializationRatio
   * until adding the next model would not help performance.
   */
  protected boolean m_greedySortInitialization = true;

  /**
   * Specifies whether or not we will output metrics for all models
   */
  protected boolean m_verboseOutput = false;

  /**
   * Hash map of cached predictions. The key is a stringified Instance. Each
   * entry is a 2d array, first indexed by classifier index (i.e., the one
   * used in m_chosen_models). The second index is the usual "distribution"
   * index across classes.
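   *
   * A sketch of the assumed layout, matching how cachePredictions() fills
   * the map:
   * <pre>
   * double[][] preds = (double[][]) m_cachedPredictions.get(inst.toString());
   * double p = preds[modelIndex][classIndex];
   * </pre>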
   */
  protected Map m_cachedPredictions = null;

  /**
   * Stores the working directory where all models, temporary prediction
   * values, and modellist logs are to be built and stored.
   */
  protected File m_workingDirectory = new File(getDefaultWorkingDirectory());

  /**
   * Indicates the number of folds for cross-validation. A value of 1
   * indicates there is no cross-validation. Cross validation is done in the
   * "embedded" fashion described by Caruana, Niculescu, and Munson
   * (unpublished work - tech report forthcoming)
   */
  protected int m_NumFolds = 1;

  /**
   * Returns a string describing classifier
   *
   * @return a description suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {

    return "Combines several classifiers using the ensemble "
    + "selection method. For more information, see: "
    + "Caruana, Rich, Niculescu, Alex, Crew, Geoff, and Ksikes, Alex, "
    + "Ensemble Selection from Libraries of Models, "
    + "The International Conference on Machine Learning (ICML'04), 2004.  "
    + "Implemented in Weka by Bob Jung and David Michael.";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector result = new Vector();

    result.addElement(new Option(
        "\tSpecifies the Model Library File, containing the list of all models.",
        "L", 1, "-L </path/to/modelLibrary>"));

    result.addElement(new Option(
        "\tSpecifies the Working Directory, where all models will be stored.",
        "W", 1, "-W </path/to/working/directory>"));

    result.addElement(new Option(
        "\tSet the number of bags, i.e., number of iterations to run \n"
        + "\tthe ensemble selection algorithm.",
        "B", 1, "-B <numModelBags>"));

    result.addElement(new Option(
        "\tSet the ratio of library models that will be randomly chosen \n"
        + "\tto populate each bag of models.",
        "E", 1, "-E <modelRatio>"));

    result.addElement(new Option(
        "\tSet the ratio of the training data set that will be reserved \n"
        + "\tfor validation.",
        "V", 1, "-V <validationRatio>"));

    result.addElement(new Option(
        "\tSet the number of hillclimbing iterations to be performed \n"
        + "\ton each model bag.",
        "H", 1, "-H <hillClimbIterations>"));

    result.addElement(new Option(
        "\tSet the ratio of the ensemble library that the sort \n"
        + "\tinitialization algorithm will be able to choose from while \n"
        + "\tinitializing the ensemble for each model bag",
        "I", 1, "-I <sortInitialization>"));

    result.addElement(new Option(
        "\tSets the number of cross-validation folds.", 
        "X", 1, "-X <numFolds>"));

    result.addElement(new Option(
        "\tSpecify the metric that will be used for model selection \n"
        + "\tduring the hillclimbing algorithm.\n"
        + "\tValid metrics are: \n"
        + "\t\taccuracy, rmse, roc, precision, recall, fscore, all",
        "P", 1, "-P <hillclimbMetric>"));

    result.addElement(new Option(
        "\tSpecifies the algorithm to be used for ensemble selection. \n"
        + "\tValid algorithms are:\n"
        + "\t\t\"forward\" (default) for forward selection.\n"
        + "\t\t\"backward\" for backward elimination.\n"
        + "\t\t\"both\" for both forward and backward elimination.\n"
        + "\t\t\"best\" to simply print out top performer from the \n"
        + "\t\t   ensemble library\n"
        + "\t\t\"library\" to only train the models in the ensemble \n"
        + "\t\t   library",
        "A", 1, "-A <algorithm>"));

    result.addElement(new Option(
        "\tFlag whether or not models can be selected more than once \n"
        + "\tfor an ensemble.",
        "R", 0, "-R"));

    result.addElement(new Option(
        "\tWhether sort initialization greedily stops adding models \n"
        + "\twhen performance degrades.",
        "G", 0, "-G"));

    result.addElement(new Option(
        "\tFlag for verbose output. Prints out performance of all \n"
        + "\tselected models.",
        "O", 0, "-O"));

    // TODO - Add more options here
    Enumeration enu = super.listOptions();
    while (enu.hasMoreElements()) {
      result.addElement(enu.nextElement());
    }

    return result.elements();
  }

  /**
   * We return true for basically everything except for missing class values,
   * because we can't really answer for all the models in our library. If any of
   * them don't work with the supplied data then we just trap the exception.
   *
   * @return      the capabilities of this classifier
   */
  public Capabilities getCapabilities() {
    // returns the object from weka.classifiers.Classifier
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);
    result.enable(Capability.BINARY_ATTRIBUTES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.NUMERIC_CLASS);
    result.enable(Capability.BINARY_CLASS);

    return result;
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -L &lt;/path/to/modelLibrary&gt;
   *  Specifies the Model Library File, containing the list of all models.</pre>
   *
   * <pre> -W &lt;/path/to/working/directory&gt;
   *  Specifies the Working Directory, where all models will be stored.</pre>
   *
   * <pre> -B &lt;numModelBags&gt;
   *  Set the number of bags, i.e., number of iterations to run
   *  the ensemble selection algorithm.</pre>
   *
   * <pre> -E &lt;modelRatio&gt;
   *  Set the ratio of library models that will be randomly chosen
   *  to populate each bag of models.</pre>
   *
   * <pre> -V &lt;validationRatio&gt;
   *  Set the ratio of the training data set that will be reserved
   *  for validation.</pre>
   *
   * <pre> -H &lt;hillClimbIterations&gt;
   *  Set the number of hillclimbing iterations to be performed
   *  on each model bag.</pre>
   *
   * <pre> -I &lt;sortInitialization&gt;
   *  Set the ratio of the ensemble library that the sort
   *  initialization algorithm will be able to choose from while
   *  initializing the ensemble for each model bag</pre>
   *
   * <pre> -X &lt;numFolds&gt;
   *  Sets the number of cross-validation folds.</pre>
   *
   * <pre> -P &lt;hillclimbMetric&gt;
   *  Specify the metric that will be used for model selection
   *  during the hillclimbing algorithm.
   *  Valid metrics are:
   *   accuracy, rmse, roc, precision, recall, fscore, all</pre>
   *
   * <pre> -A &lt;algorithm&gt;
   *  Specifies the algorithm to be used for ensemble selection.
   *  Valid algorithms are:
   *   "forward" (default) for forward selection.
   *   "backward" for backward elimination.
   *   "both" for both forward and backward elimination.
   *   "best" to simply print out top performer from the
   *      ensemble library
   *   "library" to only train the models in the ensemble
   *      library</pre>
   *
   * <pre> -R
   *  Flag whether or not models can be selected more than once
   *  for an ensemble.</pre>
   *
   * <pre> -G
   *  Whether sort initialization greedily stops adding models
   *  when performance degrades.</pre>
   *
   * <pre> -O
   *  Flag for verbose output. Prints out performance of all
   *  selected models.</pre>
   *
   * <pre> -S &lt;num&gt;
   *  Random number seed.
   *  (default 1)</pre>
   *
   * <pre> -D
   *  If set, classifier is run in debug mode and
   *  may output additional info to the console</pre>
   *
   <!-- options-end -->
   *
   * @param options
   *            the list of options as an array of strings
   * @throws Exception
   *                if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String      tmpStr;

    tmpStr = Utils.getOption('L', options);
    if (tmpStr.length() != 0) {
      m_modelLibraryFileName = tmpStr;
      m_library = new EnsembleSelectionLibrary(m_modelLibraryFileName);
    } else {
      setLibrary(new EnsembleSelectionLibrary());
      // setLibrary(new Library(super.m_Classifiers));
    }

    tmpStr = Utils.getOption('W', options);
    if (tmpStr.length() != 0 && validWorkingDirectory(tmpStr)) {
      m_workingDirectory = new File(tmpStr);
    } else {
      m_workingDirectory = new File(getDefaultWorkingDirectory());
    }
    m_library.setWorkingDirectory(m_workingDirectory);

    tmpStr = Utils.getOption('E', options);
    if (tmpStr.length() != 0) {
      setModelRatio(Double.parseDouble(tmpStr));
    } else {
      setModelRatio(1.0);
    }

    tmpStr = Utils.getOption('V', options);
    if (tmpStr.length() != 0) {
      setValidationRatio(Double.parseDouble(tmpStr));
    } else {
      setValidationRatio(0.25);
    }

    tmpStr = Utils.getOption('B', options);
    if (tmpStr.length() != 0) {
      setNumModelBags(Integer.parseInt(tmpStr));
    } else {
      setNumModelBags(10);
    }

    tmpStr = Utils.getOption('H', options);
    if (tmpStr.length() != 0) {
      setHillclimbIterations(Integer.parseInt(tmpStr));
    } else {
      setHillclimbIterations(100);
    }

    tmpStr = Utils.getOption('I', options);
    if (tmpStr.length() != 0) {
      setSortInitializationRatio(Double.parseDouble(tmpStr));
    } else {
      setSortInitializationRatio(1.0);
    }

    tmpStr = Utils.getOption('X', options);
    if (tmpStr.length() != 0) {
      setNumFolds(Integer.parseInt(tmpStr));
    } else {
      setNumFolds(10);
    }

    setReplacement(Utils.getFlag('R', options));

    setGreedySortInitialization(Utils.getFlag('G', options));

    setVerboseOutput(Utils.getFlag('O', options));

    tmpStr = Utils.getOption('P', options);
    if (tmpStr.toLowerCase().equals("accuracy")) {
      setHillclimbMetric(new SelectedTag(
          EnsembleMetricHelper.METRIC_ACCURACY, TAGS_METRIC));
    } else if (tmpStr.toLowerCase().equals("rmse")) {
      setHillclimbMetric(new SelectedTag(
          EnsembleMetricHelper.METRIC_RMSE, TAGS_METRIC));
    } else if (tmpStr.toLowerCase().equals("roc")) {
      setHillclimbMetric(new SelectedTag(
          EnsembleMetricHelper.METRIC_ROC, TAGS_METRIC));
    } else if (tmpStr.toLowerCase().equals("precision")) {
      setHillclimbMetric(new SelectedTag(
          EnsembleMetricHelper.METRIC_PRECISION, TAGS_METRIC));
    } else if (tmpStr.toLowerCase().equals("recall")) {
      setHillclimbMetric(new SelectedTag(
          EnsembleMetricHelper.METRIC_RECALL, TAGS_METRIC));
    } else if (tmpStr.toLowerCase().equals("fscore")) {
      setHillclimbMetric(new SelectedTag(
          EnsembleMetricHelper.METRIC_FSCORE, TAGS_METRIC));
    } else if (tmpStr.toLowerCase().equals("all")) {
      setHillclimbMetric(new SelectedTag(
          EnsembleMetricHelper.METRIC_ALL, TAGS_METRIC));
    } else {
      // default when no (or an unknown) metric is given
      setHillclimbMetric(new SelectedTag(
          EnsembleMetricHelper.METRIC_RMSE, TAGS_METRIC));
    }

    tmpStr = Utils.getOption('A', options);
    if (tmpStr.toLowerCase().equals("forward")) {
      setAlgorithm(new SelectedTag(ALGORITHM_FORWARD, TAGS_ALGORITHM));
    } else if (tmpStr.toLowerCase().equals("backward")) {
      setAlgorithm(new SelectedTag(ALGORITHM_BACKWARD, TAGS_ALGORITHM));
    } else if (tmpStr.toLowerCase().equals("both")) {
      setAlgorithm(new SelectedTag(ALGORITHM_FORWARD_BACKWARD, TAGS_ALGORITHM));
    } else if (tmpStr.toLowerCase().equals("best")) {
      setAlgorithm(new SelectedTag(ALGORITHM_BEST, TAGS_ALGORITHM));
    } else if (tmpStr.toLowerCase().equals("library")) {
      setAlgorithm(new SelectedTag(ALGORITHM_BUILD_LIBRARY, TAGS_ALGORITHM));
    } else {
      // default when no (or an unknown) algorithm is given
      setAlgorithm(new SelectedTag(ALGORITHM_FORWARD, TAGS_ALGORITHM));
    }

    super.setOptions(options);

    m_library.setDebug(m_Debug);
  }


  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    Vector        result;
    String[]      options;
    int           i;

    result  = new Vector();

    if (m_library.getModelListFile() != null) {
      result.add("-L");
      result.add("" + m_library.getModelListFile());
    }

    // only add -W if a non-empty working directory path is set
    if (!m_workingDirectory.getPath().equals("")) {
      result.add("-W");
      result.add("" + getWorkingDirectory());
    }

    result.add("-P");
    switch (getHillclimbMetric().getSelectedTag().getID()) {
      case (EnsembleMetricHelper.METRIC_ACCURACY):
        result.add("accuracy");
      break;
      case (EnsembleMetricHelper.METRIC_RMSE):
        result.add("rmse");
      break;
      case (EnsembleMetricHelper.METRIC_ROC):
        result.add("roc");
      break;
      case (EnsembleMetricHelper.METRIC_PRECISION):
        result.add("precision");
      break;
      case (EnsembleMetricHelper.METRIC_RECALL):
        result.add("recall");
      break;
      case (EnsembleMetricHelper.METRIC_FSCORE):
        result.add("fscore");
      break;
      case (EnsembleMetricHelper.METRIC_ALL):
        result.add("all");
      break;
    }

    result.add("-A");
    switch (getAlgorithm().getSelectedTag().getID()) {
      case (ALGORITHM_FORWARD):
        result.add("forward");
      break;
      case (ALGORITHM_BACKWARD):
        result.add("backward");
      break;
      case (ALGORITHM_FORWARD_BACKWARD):
        result.add("both");
      break;
      case (ALGORITHM_BEST):
        result.add("best");
      break;
      case (ALGORITHM_BUILD_LIBRARY):
        result.add("library");
      break;
    }

    result.add("-B");
    result.add("" + getNumModelBags());
    result.add("-V");
    result.add("" + getValidationRatio());
    result.add("-E");
    result.add("" + getModelRatio());
    result.add("-H");
    result.add("" + getHillclimbIterations());
    result.add("-I");
    result.add("" + getSortInitializationRatio());
    result.add("-X");
    result.add("" + getNumFolds());

    if (m_replacement)
      result.add("-R");
    if (m_greedySortInitialization)
      result.add("-G");
    if (m_verboseOutput)
      result.add("-O");

    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);

    return (String[]) result.toArray(new String[result.size()]);
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String numFoldsTipText() {
    return "The number of folds used for cross-validation.";
  }

  /**
   * Gets the number of folds for the cross-validation.
   *
   * @return the number of folds for the cross-validation
   */
  public int getNumFolds() {
    return m_NumFolds;
  }

  /**
   * Sets the number of folds for the cross-validation.
   *
   * @param numFolds
   *            the number of folds for the cross-validation
   * @throws Exception
   *                if parameter illegal
   */
  public void setNumFolds(int numFolds) throws Exception {
    if (numFolds < 0) {
      throw new IllegalArgumentException(
          "EnsembleSelection: Number of cross-validation "
          + "folds cannot be negative.");
    }
    m_NumFolds = numFolds;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String libraryTipText() {
    return "An ensemble library.";
  }

  /**
   * Gets the ensemble library.
   *
   * @return the ensemble library
   */
  public EnsembleSelectionLibrary getLibrary() {
    return m_library;
  }

  /**
   * Sets the ensemble library.
   *
   * @param newLibrary
   *            the ensemble library
   */
  public void setLibrary(EnsembleSelectionLibrary newLibrary) {
    m_library = newLibrary;
    m_library.setDebug(m_Debug);
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String modelRatioTipText() {
    return "The ratio of library models that will be randomly chosen to be used for each iteration.";
  }

  /**
   * Get the value of modelRatio.
   *
   * @return Value of modelRatio.
   */
  public double getModelRatio() {
    return m_modelRatio;
  }

  /**
   * Set the value of modelRatio.
   *
   * @param v
   *            Value to assign to modelRatio.
   */
  public void setModelRatio(double v) {
    m_modelRatio = v;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String validationRatioTipText() {
    return "The ratio of the training data set that will be reserved for validation.";
  }

  /**
   * Get the value of validationRatio.
   *
   * @return Value of validationRatio.
   */
  public double getValidationRatio() {
    return m_validationRatio;
  }

  /**
   * Set the value of validationRatio.
   *
   * @param v
   *            Value to assign to validationRatio.
   */
  public void setValidationRatio(double v) {
    m_validationRatio = v;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String hillclimbMetricTipText() {
    return "The metric that will be used to optimize the chosen ensemble.";
  }

  /**
   * Gets the hill climbing metric. Will be one of METRIC_ACCURACY,
   * METRIC_RMSE, METRIC_ROC, METRIC_PRECISION, METRIC_RECALL, METRIC_FSCORE,
   * METRIC_ALL
   *
   * @return the hillclimbMetric
   */
  public SelectedTag getHillclimbMetric() {
    return new SelectedTag(m_hillclimbMetric, TAGS_METRIC);
  }

  /**
   * Sets the hill climbing metric. Will be one of METRIC_ACCURACY,
   * METRIC_RMSE, METRIC_ROC, METRIC_PRECISION, METRIC_RECALL, METRIC_FSCORE,
   * METRIC_ALL
   *
   * @param newType
   *            the new hillclimbMetric
   */
  public void setHillclimbMetric(SelectedTag newType) {
    if (newType.getTags() == TAGS_METRIC) {
      m_hillclimbMetric = newType.getSelectedTag().getID();
    }
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String algorithmTipText() {
    return "The algorithm used to optimize the ensemble.";
  }

  /**
   * Gets the algorithm
   *
   * @return the algorithm
   */
  public SelectedTag getAlgorithm() {
    return new SelectedTag(m_algorithm, TAGS_ALGORITHM);
  }

  /**
   * Sets the Algorithm to use
   *
   * @param newType
   *            the new algorithm
   */
  public void setAlgorithm(SelectedTag newType) {
    if (newType.getTags() == TAGS_ALGORITHM) {
      m_algorithm = newType.getSelectedTag().getID();
    }
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String hillclimbIterationsTipText() {
    return "The number of hillclimbing iterations for the ensemble selection algorithm.";
  }

  /**
   * Gets the number of hillclimbIterations.
   *
   * @return the number of hillclimbIterations
   */
  public int getHillclimbIterations() {
    return m_hillclimbIterations;
  }

  /**
   * Sets the number of hillclimbIterations.
   *
   * @param n
   *            the number of hillclimbIterations
   * @throws Exception
   *                if parameter illegal
   */
  public void setHillclimbIterations(int n) throws Exception {
    if (n < 0) {
      throw new IllegalArgumentException(
          "EnsembleSelection: Number of hillclimb iterations "
          + "cannot be negative.");
    }
    m_hillclimbIterations = n;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String numModelBagsTipText() {
    return "The number of \"model bags\" used in the ensemble selection algorithm.";
  }

  /**
   * Gets numModelBags.
   *
   * @return numModelBags
   */
  public int getNumModelBags() {
    return m_numModelBags;
  }

  /**
   * Sets numModelBags.
   *
   * @param n
   *            the new value for numModelBags
   * @throws Exception
   *                if parameter illegal
   */
  public void setNumModelBags(int n) throws Exception {
    if (n <= 0) {
      throw new IllegalArgumentException(
          "EnsembleSelection: Number of model bags "
          + "must be positive.");
    }
    m_numModelBags = n;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String sortInitializationRatioTipText() {
    return "The ratio of library models to be used for sort initialization.";
  }

  /**
   * Get the value of sortInitializationRatio.
   *
   * @return Value of sortInitializationRatio.
   */
  public double getSortInitializationRatio() {
    return m_sortInitializationRatio;
  }

  /**
   * Set the value of sortInitializationRatio.
   *
   * @param v
   *            Value to assign to sortInitializationRatio.
   */
  public void setSortInitializationRatio(double v) {
    m_sortInitializationRatio = v;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String replacementTipText() {
    return "Whether models in the library can be included more than once in an ensemble.";
  }

  /**
   * Get the value of replacement.
   *
   * @return Value of replacement.
   */
  public boolean getReplacement() {
    return m_replacement;
  }

  /**
   * Set the value of replacement.
   *
   * @param newReplacement
   *            Value to assign to replacement.
   */
  public void setReplacement(boolean newReplacement) {
    m_replacement = newReplacement;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String greedySortInitializationTipText() {
    return "Whether sort initialization greedily stops adding models when performance degrades.";
  }

  /**
   * Get the value of greedySortInitialization.
   *
   * @return Value of greedySortInitialization.
   */
  public boolean getGreedySortInitialization() {
    return m_greedySortInitialization;
  }

  /**
   * Set the value of greedySortInitialization.
   *
   * @param newGreedySortInitialization
   *            Value to assign to greedySortInitialization.
   */
  public void setGreedySortInitialization(boolean newGreedySortInitialization) {
    m_greedySortInitialization = newGreedySortInitialization;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String verboseOutputTipText() {
    return "Whether metrics are printed for each model.";
  }

  /**
   * Get the value of verboseOutput.
   *
   * @return Value of verboseOutput.
   */
  public boolean getVerboseOutput() {
    return m_verboseOutput;
  }

  /**
   * Set the value of verboseOutput.
   *
   * @param newVerboseOutput
   *            Value to assign to verboseOutput.
   */
  public void setVerboseOutput(boolean newVerboseOutput) {
    m_verboseOutput = newVerboseOutput;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String workingDirectoryTipText() {
    return "The working directory of the ensemble - where trained models will be stored.";
  }

  /**
   * Get the value of working directory.
   *
   * @return Value of working directory.
   */
  public File getWorkingDirectory() {
    return m_workingDirectory;
  }

  /**
   * Set the value of working directory.
   *
   * @param newWorkingDirectory the new working directory.
   */
  public void setWorkingDirectory(File newWorkingDirectory) {
    if (m_Debug) {
      System.out.println("working directory changed to: "
          + newWorkingDirectory);
    }
    m_library.setWorkingDirectory(newWorkingDirectory);

    m_workingDirectory = newWorkingDirectory;
  }

  /**
   * Builds the ensemble. Trains any untrained models in the library, then
   * selects (and weights) models from it so as to optimize the chosen metric
   * on the reserved hillclimb/validation data.
   *
   * @param trainData   the training data to be used for generating the
   *                    ensemble.
   * @throws Exception  if the classifier could not be built successfully
   */
  public void buildClassifier(Instances trainData) throws Exception {

    getCapabilities().testWithFail(trainData);
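
    // Overview of the steps below: train or load all library models, cache
    // their predictions on the hillclimb set, then for each model bag draw
    // a random subset of the library, optionally sort-initialize it,
    // hillclimb with the selected algorithm, and accumulate the per-bag
    // model weights into the final ensemble.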

    // If no library was set at all, start with an empty one.
    if (m_library == null) {
      m_library = new EnsembleSelectionLibrary();
      m_library.setDebug(m_Debug);
    }

    // Next we need to make sure that some library models were specified.
    // If not, then use a default list.
    if (m_library.m_Models.size() == 0) {

      System.out
      .println("WARNING: No library file specified.  Using some default models.");
      System.out
      .println("You should specify a model list with -L <file> from the command line.");
      System.out
      .println("Or edit the list directly with the LibraryEditor from the GUI");

      for (int i = 0; i < 10; i++) {

        REPTree tree = new REPTree();
        tree.setSeed(i);
        m_library.addModel(new EnsembleSelectionLibraryModel(tree));

      }

    }

    m_library.setNumFolds(getNumFolds());
    m_library.setValidationRatio(getValidationRatio());
    // train all untrained models, and set "data" to the hillclimbing set.
    Instances data = m_library.trainAll(trainData, m_workingDirectory.getAbsolutePath(),
        m_algorithm);
    // We cache the hillclimb predictions from all of the models in the
    // library so that we can evaluate their performances when we combine
    // them in various ways (without needing to keep the classifiers in
    // memory).
    double predictions[][][] = m_library.getHillclimbPredictions();
    int numModels = predictions.length;
    int modelWeights[] = new int[numModels];
    m_total_weight = 0;
    Random rand = new Random(m_Seed);

    if (m_algorithm == ALGORITHM_BUILD_LIBRARY) {
      return;

    } else if (m_algorithm == ALGORITHM_BEST) {
      // If we want to choose the best model, just make a model bag that
      // includes all the models, then sort initialize to find the 1 that
      // performs best.
      ModelBag model_bag = new ModelBag(predictions, 1.0, m_Debug);
      int[] modelPicked = model_bag.sortInitialize(1, false, data,
          m_hillclimbMetric);
      // Then give it a weight of 1, while all others remain 0.
      modelWeights[modelPicked[0]] = 1;
    } else {

      if (m_Debug)
        System.out.println("Starting hillclimbing algorithm: "
            + m_algorithm);

      for (int i = 0; i < getNumModelBags(); ++i) {
        // For the number of bags,
        if (m_Debug)
          System.out.println("Starting on ensemble bag: " + i);
        // Create a new bag of the appropriate size
        ModelBag modelBag = new ModelBag(predictions, getModelRatio(),
            m_Debug);
        // And shuffle it.
        modelBag.shuffle(rand);
        if (getSortInitializationRatio() > 0.0) {
          // Sort initialize, if the ratio is greater than 0.
          modelBag.sortInitialize((int) (getSortInitializationRatio()
              * getModelRatio() * numModels),
              getGreedySortInitialization(), data,
              m_hillclimbMetric);
        }

        if (m_algorithm == ALGORITHM_BACKWARD) {
          // If we're doing backwards elimination, we just give all models
          // a weight of 1 initially. If the # of hillclimb iterations is
          // too high, we'll end up with just one model in the end (we
          // never delete all models from a bag). TODO - it might be
          // smarter to base this weight off of how many models we have.
          modelBag.weightAll(1); // for now at least, I'm just assuming 1.
        }
        // Now the bag is initialized, and we're ready to hillclimb.
        for (int j = 0; j < getHillclimbIterations(); ++j) {
          if (m_algorithm == ALGORITHM_FORWARD) {
            modelBag.forwardSelect(getReplacement(), data,
                m_hillclimbMetric);
          } else if (m_algorithm == ALGORITHM_BACKWARD) {
            modelBag.backwardEliminate(data, m_hillclimbMetric);
          } else if (m_algorithm == ALGORITHM_FORWARD_BACKWARD) {
            modelBag.forwardSelectOrBackwardEliminate(
                getReplacement(), data, m_hillclimbMetric);
          }
        }
        // Now that we've done all the hillclimbing steps, we can just get
        // the model weights that the bag determined, and add them to our
        // running total.
        int[] bagWeights = modelBag.getModelWeights();
        for (int j = 0; j < bagWeights.length; ++j) {
          modelWeights[j] += bagWeights[j];
        }
      }
    }
    // Now we've done the hard work of actually learning the ensemble. Now
    // we set up the appropriate data structures so that Ensemble Selection
    // can make predictions for future test examples.
    Set modelNames = m_library.getModelNames();
    String[] modelNamesArray = new String[m_library.size()];
    Iterator iter = modelNames.iterator();
    // libraryIndex indexes over all the models in the library (not just
    // those which we chose for the ensemble).
    int libraryIndex = 0;
    // chosenModels will count the total number of models which were selected
    // by EnsembleSelection (those that have non-zero weight).
    int chosenModels = 0;
    while (iter.hasNext()) {
      // Note that we have to be careful of order. Our model_weights array
      // is in the same order as our list of models in m_library.

      // Get the name of the model,
      modelNamesArray[libraryIndex] = (String) iter.next();
      // and its weight.
      int weightOfModel = modelWeights[libraryIndex++];
      m_total_weight += weightOfModel;
      if (weightOfModel > 0) {
        // If the model was chosen at least once, increment the
        // number of chosen models.
        ++chosenModels;
      }
    }
    if (m_verboseOutput) {
      // Output every model and its performance with respect to the
      // validation data.
      ModelBag bag = new ModelBag(predictions, 1.0, m_Debug);
      int modelIndexes[] = bag.sortInitialize(modelNamesArray.length,
          false, data, m_hillclimbMetric);
      double modelPerformance[] = bag.getIndividualPerformance(data,
          m_hillclimbMetric);
      for (int i = 0; i < modelIndexes.length; ++i) {
        // TODO - Could do this in a more readable way.
        System.out.println("" + modelPerformance[i] + " "
            + modelNamesArray[modelIndexes[i]]);
      }
    }
    // We're now ready to build our array of the models which were chosen
    // and their associated weights.
    m_chosen_models = new EnsembleSelectionLibraryModel[chosenModels];
    m_chosen_model_weights = new int[chosenModels];

    libraryIndex = 0;
    // chosenIndex indexes over the models which were chosen by
    // EnsembleSelection (those which have non-zero weight).
    int chosenIndex = 0;
    iter = m_library.getModels().iterator();
    while (iter.hasNext()) {
      int weightOfModel = modelWeights[libraryIndex++];

      EnsembleSelectionLibraryModel model = (EnsembleSelectionLibraryModel) iter
      .next();

      if (weightOfModel > 0) {
        // If the model was chosen at least once, add it to our array
        // of chosen models and weights.
        m_chosen_models[chosenIndex] = model;
        m_chosen_model_weights[chosenIndex] = weightOfModel;
        // Note that the EnsembleSelectionLibraryModel may not be
        // "loaded" - that is, its classifier(s) may be null pointers.
        // That's okay - we'll "rehydrate" them later, if and when we
        // need to.
        ++chosenIndex;
      }
    }
  }


  /**
   * Calculates the class membership probabilities for the given test instance.
   *
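   * Each chosen model contributes in proportion to its weight; ignoring the
   * null-prediction guard below, the combination is
   * <pre>
   * prediction[j] = sum_i( m_chosen_model_weights[i] * pred_i[j] ) / m_total_weight
   * </pre>
   *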
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @throws Exception if instance could not be classified
   * successfully
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
    String stringInstance = instance.toString();
    double cachedPreds[][] = null;

    if (m_cachedPredictions != null) {
      // If we have any cached predictions (i.e., if cachePredictions was
      // called), look for a cached set of predictions for this instance.
      if (m_cachedPredictions.containsKey(stringInstance)) {
        cachedPreds = (double[][]) m_cachedPredictions.get(stringInstance);
      }
    }
    double[] prediction = new double[instance.numClasses()];
    for (int i = 0; i < prediction.length; ++i) {
      prediction[i] = 0.0;
    }

    // Now do a weighted average of the predictions of each of our models.
    for (int i = 0; i < m_chosen_models.length; ++i) {
      double[] predictionForThisModel = null;
      if (cachedPreds == null) {
        // If there are no predictions cached, we'll load the model's
        // classifier(s) in to memory and get the predictions.
        m_chosen_models[i].rehydrateModel(m_workingDirectory.getAbsolutePath());
        predictionForThisModel = m_chosen_models[i].getAveragePrediction(instance);
        // We could release the model here to save memory, but we assume
        // that there is enough available since we're not using the
        // prediction caching functionality. If we load and release a
        // model every time we need to get a prediction for an instance,
        // it can be prohibitively slow.
      } else {
        // If it's cached, just get it from the array of cached preds
        // for this instance.
        predictionForThisModel = cachedPreds[i];
      }
      // We have encountered a bug where MultilayerPerceptron returns a
      // null prediction array. If that happens, we just don't count that
      // model in our ensemble prediction.
      if (predictionForThisModel != null) {
        // Okay, the model returned a valid prediction array, so we'll
        // add the appropriate fraction of this model's prediction.
        for (int j = 0; j < prediction.length; ++j) {
          prediction[j] += m_chosen_model_weights[i] * predictionForThisModel[j] / m_total_weight;
        }
      }
    }
    // normalize to add up to 1.
    if (instance.classAttribute().isNominal()) {
      if (Utils.sum(prediction) > 0)
        Utils.normalize(prediction);
    }
    return prediction;
  }

  /**
   * This function tests whether or not a given path is appropriate for being
   * the working directory. Specifically, we care that we can write to the
   * path and that it doesn't point to a "non-directory" file handle.
   *
   * @param dir         the directory to test
   * @return            true if the directory is valid
   */
  private boolean validWorkingDirectory(String dir) {

    boolean valid = false;

    File f = new File(dir);

    if (f.exists()) {
      if (f.isDirectory() && f.canWrite())
        valid = true;
    } else {
      if (f.canWrite())
        valid = true;
    }

    return valid;

  }

  /**
   * This method tries to find a reasonable path name for the ensemble working
   * directory where models and files will be stored.
   *
   * @return a default working directory path name, or the empty string if
   *         none could be found
   */
  public static String getDefaultWorkingDirectory() {

    String defaultDirectory = "";

    boolean success = false;

    int i = 1;

    while (i < MAX_DEFAULT_DIRECTORIES && !success) {

      File f = new File(System.getProperty("user.home"), "Ensemble-" + i);

      if (!f.exists() && f.getParentFile().canWrite()) {
        defaultDirectory = f.getPath();
        success = true;
      }
      i++;

    }

    if (!success) {
      defaultDirectory = "";
      // should we print an error or something?
    }

    return defaultDirectory;
  }

  /**
   * Output a representation of this classifier
   *
   * @return    a string representation of the classifier
   */
  public String toString() {
    // We just print out the models which were selected, and the number
    // of times each was selected.
    String result = new String();
    if (m_chosen_models != null) {
      for (int i = 0; i < m_chosen_models.length; ++i) {
        result += m_chosen_model_weights[i];
        result += " " + m_chosen_models[i].getStringRepresentation()
        + "\n";
      }
    } else {
      result = "No models selected.";
    }
    return result;
  }

1526  /**
1527   * Cache predictions for the individual base classifiers in the ensemble
1528   * with respect to the given dataset. This is used so that when testing a
1529   * large ensemble on a test set, we don't have to keep the models in memory.
1530   *
1531   * @param test        The instances for which to cache predictions.
1532   * @throws Exception  if somethng goes wrong
1533   */
  private void cachePredictions(Instances test) throws Exception {
    m_cachedPredictions = new HashMap();
    Evaluation evalModel = null;
    Instances originalInstances = null;
    // If the verbose flag is set, we'll also print out the performances of
    // all the individual models w.r.t. this test set while we're at it.
    boolean printModelPerformances = getVerboseOutput();
    if (printModelPerformances) {
      // To get performances, we need to keep the class attribute.
      originalInstances = new Instances(test);
    }

    // For each model, we'll go through the dataset and get predictions.
    // The idea is that we only want to have one model in memory at a time,
    // so we load one model into memory, get all its predictions, and add
    // them to the hash map. Then we can release it from memory and move on
    // to the next.
    for (int i = 0; i < m_chosen_models.length; ++i) {
      if (printModelPerformances) {
        // If we're going to print predictions, we need to make a new
        // Evaluation object.
        evalModel = new Evaluation(originalInstances);
      }

      Date startTime = new Date();

      // Load the model into memory.
      m_chosen_models[i].rehydrateModel(m_workingDirectory.getAbsolutePath());
      // Now loop through all the instances and get the model's predictions.
      for (int j = 0; j < test.numInstances(); ++j) {
        Instance currentInstance = test.instance(j);
        // When we're looking for a cached prediction later, we'll only
        // have the non-class attributes, so we set the class missing here
        // in order to make the string match up properly.
        currentInstance.setClassMissing();
        String stringInstance = currentInstance.toString();

        // When we come in here with the first model, the instance will
        // not yet be part of the map.
        if (!m_cachedPredictions.containsKey(stringInstance)) {
          // The instance isn't in the map yet, so add it. For each
          // instance, we store a two-dimensional array - the first index
          // is over all the models in the ensemble, and the second index
          // is over the class values (i.e., a typical prediction array).
          int predSize = test.classAttribute().isNumeric() ? 1 : test
              .classAttribute().numValues();
          double predictionArray[][] = new double[m_chosen_models.length][predSize];
          m_cachedPredictions.put(stringInstance, predictionArray);
        }
        // Get the array from the map which is associated with this
        // instance...
        double predictions[][] = (double[][]) m_cachedPredictions
            .get(stringInstance);
        // ...and add our model's prediction for it.
        predictions[i] = m_chosen_models[i].getAveragePrediction(test
            .instance(j));

        if (printModelPerformances) {
          evalModel.evaluateModelOnceAndRecordPrediction(
              predictions[i], originalInstances.instance(j));
        }
      }
      // Now we're done with model #i, so we can release it.
      m_chosen_models[i].releaseModel();

      Date endTime = new Date();
      long diff = endTime.getTime() - startTime.getTime();

      if (m_Debug)
        System.out.println("Test time for "
            + m_chosen_models[i].getStringRepresentation()
            + " was: " + diff);

      if (printModelPerformances) {
        String output = m_chosen_models[i].getStringRepresentation() + ": ";
        output += "\tRMSE:" + evalModel.rootMeanSquaredError();
        output += "\tACC:" + evalModel.pctCorrect();
        if (test.numClasses() == 2) {
          // For multiclass problems, we could print these too, but it's
          // not clear which class we should use in that case... so instead
          // we only print these metrics for binary classification problems.
          output += "\tROC:" + evalModel.areaUnderROC(1);
          output += "\tPREC:" + evalModel.precision(1);
          output += "\tFSCR:" + evalModel.fMeasure(1);
        }
        System.out.println(output);
      }
    }
  }
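  // Layout of the cache built above: each instance's string form (with the
  // class value set to missing) maps to a
  // double[m_chosen_models.length][predSize] array. For example, with 5
  // chosen models on a 3-class problem each entry is a new double[5][3],
  // where row i holds model i's class distribution for that instance.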

  /**
   * Returns the technical information for this classifier. There is
   * another paper that describes our current cross-validation method for
   * this classifier.
   * TODO: cite the technical report when it is published.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {

    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Rich Caruana, Alex Niculescu, Geoff Crew, and Alex Ksikes");
    result.setValue(Field.TITLE, "Ensemble Selection from Libraries of Models");
    result.setValue(Field.BOOKTITLE, "21st International Conference on Machine Learning");
    result.setValue(Field.YEAR, "2004");

    return result;
  }
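  // For reference, TechnicalInformation.toString() renders the entry above
  // roughly as follows (the exact formatting may differ between Weka
  // versions):
  //   Rich Caruana, Alex Niculescu, Geoff Crew, and Alex Ksikes: Ensemble
  //   Selection from Libraries of Models. In: 21st International Conference
  //   on Machine Learning, 2004.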

  /**
   * Returns the revision string.
   *
   * @return            the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 5480 $");
  }

  /**
   * Executes the classifier from the command line.
   *
   * @param argv
   *            should contain the following arguments: -t training file [-T
   *            test file] [-c class index]; additionally, -l model file, -W
   *            working directory, -D (debug) and -O (verbose output) are
   *            recognized, and -xml can supply all options as an XML string
   */
  public static void main(String[] argv) {

    try {

      String options[] = (String[]) argv.clone();

      // do we get the input from XML instead of normal parameters?
      String xml = Utils.getOption("xml", options);
      if (!xml.equals(""))
        options = new XMLOptions(xml).toArray();

      String trainFileName = Utils.getOption('t', options);
      String objectInputFileName = Utils.getOption('l', options);
      String testFileName = Utils.getOption('T', options);

      if (testFileName.length() != 0 && objectInputFileName.length() != 0
          && trainFileName.length() == 0) {

        System.out.println("Caching predictions");

        EnsembleSelection classifier = null;

        BufferedReader testReader = new BufferedReader(new FileReader(
            testFileName));

        // Set up the Instances object.
        Instances test;
        int classIndex = -1;
        String classIndexString = Utils.getOption('c', options);
        if (classIndexString.length() != 0) {
          classIndex = Integer.parseInt(classIndexString);
        }

        test = new Instances(testReader, 1);
        // Validate the class index before using it, so that an out-of-range
        // value produces a clear error message.
        if (classIndex > test.numAttributes()) {
          throw new Exception("Index of class attribute too large.");
        }
        if (classIndex != -1) {
          test.setClassIndex(classIndex - 1);
        } else {
          test.setClassIndex(test.numAttributes() - 1);
        }

        // Read the test set incrementally, one instance at a time.
        while (test.readInstance(testReader)) {
        }
        testReader.close();

        // Now yoink the EnsembleSelection object from the file system.

        InputStream is = new FileInputStream(objectInputFileName);
        if (objectInputFileName.endsWith(".gz")) {
          is = new GZIPInputStream(is);
        }

        // load from KOML?
        if (!(objectInputFileName.endsWith(".koml") && KOML.isPresent())) {
          ObjectInputStream objectInputStream = new ObjectInputStream(is);
          classifier = (EnsembleSelection) objectInputStream.readObject();
          objectInputStream.close();
        } else {
          BufferedInputStream komlInputStream = new BufferedInputStream(is);
          classifier = (EnsembleSelection) KOML.read(komlInputStream);
          komlInputStream.close();
        }
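        // At this point the classifier has been restored either via standard
        // Java serialization (optionally gzip-compressed) or via KOML; the
        // same format choices are mirrored when the model is written back
        // out below.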

        String workingDir = Utils.getOption('W', argv);
        if (!workingDir.equals("")) {
          classifier.setWorkingDirectory(new File(workingDir));
        }

        classifier.setDebug(Utils.getFlag('D', argv));
        classifier.setVerboseOutput(Utils.getFlag('O', argv));

        classifier.cachePredictions(test);

        // Now we write the model back out to the file system.
        String objectOutputFileName = objectInputFileName;
        OutputStream os = new FileOutputStream(objectOutputFileName);
        // binary
        if (!(objectOutputFileName.endsWith(".xml") || (objectOutputFileName
            .endsWith(".koml") && KOML.isPresent()))) {
          if (objectOutputFileName.endsWith(".gz")) {
            os = new GZIPOutputStream(os);
          }
          ObjectOutputStream objectOutputStream = new ObjectOutputStream(os);
          objectOutputStream.writeObject(classifier);
          objectOutputStream.flush();
          objectOutputStream.close();
        }
        // KOML/XML
        else {
          BufferedOutputStream xmlOutputStream = new BufferedOutputStream(os);
          if (objectOutputFileName.endsWith(".xml")) {
            XMLSerialization xmlSerial = new XMLClassifier();
            xmlSerial.write(xmlOutputStream, classifier);
          } else
            // whether KOML is present has already been checked;
            // if not present, ".koml" is interpreted as binary - see above
            if (objectOutputFileName.endsWith(".koml")) {
              KOML.write(xmlOutputStream, classifier);
            }
          xmlOutputStream.close();
        }

      }

      System.out.println(Evaluation.evaluateModel(
          new EnsembleSelection(), argv));

    } catch (Exception e) {
      if ((e.getMessage() != null)
          && (e.getMessage().indexOf("General options") == -1))
        e.printStackTrace();
      else
        System.err.println(e.getMessage());
    }
  }
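  // Usage sketch (file names are hypothetical). Train and evaluate:
  //   java weka.classifiers.meta.EnsembleSelection -t train.arff -T test.arff
  // Cache predictions for a previously saved model (note: no -t option):
  //   java weka.classifiers.meta.EnsembleSelection -l model.gz -T test.arff \
  //       -W /path/to/workingDir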
}