/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* LinearForwardSelection.java
* Copyright (C) 2007 Martin Guetlein
*
*/
package weka.attributeSelection;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;
/**
* LinearForwardSelection:
*
* Extension of BestFirst. Takes a restricted number of k attributes into account. Fixed-set selects a fixed number k of attributes, whereas k is increased in each step when fixed-width is selected. The search uses either the initial ordering to select the top k attributes, or performs a ranking (with the same evalutator the search uses later on). The search direction can be forward, or floating forward selection (with opitional backward search steps).
*
* For more information see:
*
* Martin Guetlein, Eibe Frank, Mark Hall, Andreas Karwath: Large Scale Attribute Selection Using Wrappers. In: Proc IEEE Symposium on Computational Intelligence and Data Mining, 332-339, 2009.
*
* Martin Guetlein (2006). Large Scale Attribute Selection Using Wrappers. Freiburg, Germany.
*
-P <start set> * Specify a starting set of attributes. * Eg. 1,3,5-7.* *
-D <0 = forward selection | 1 = floating forward selection> * Forward selection method. (default = 0).* *
-N <num> * Number of non-improving nodes to * consider before terminating search.* *
-I * Perform initial ranking to select the * top-ranked attributes.* *
-K <num> * Number of top-ranked attributes that are * taken into account by the search.* *
-T <0 = fixed-set | 1 = fixed-width> * Type of Linear Forward Selection (default = 0).* *
-S <num> * Size of lookup cache for evaluated subsets. * Expressed as a multiple of the number of * attributes in the data set. (default = 1)* *
-Z * verbose on/off* * * @author Martin Guetlein (martin.guetlein@gmail.com) * @version $Revision: 6160 $ */ public class LinearForwardSelection extends ASSearch implements OptionHandler, StartSetHandler, TechnicalInformationHandler { /** search directions */ protected static final int SEARCH_METHOD_FORWARD = 0; protected static final int SEARCH_METHOD_FLOATING = 1; public static final Tag[] TAGS_SEARCH_METHOD = { new Tag(SEARCH_METHOD_FORWARD, "Forward selection"), new Tag(SEARCH_METHOD_FLOATING, "Floating forward selection"), }; /** search directions */ protected static final int TYPE_FIXED_SET = 0; protected static final int TYPE_FIXED_WIDTH = 1; public static final Tag[] TAGS_TYPE = { new Tag(TYPE_FIXED_SET, "Fixed-set"), new Tag(TYPE_FIXED_WIDTH, "Fixed-width"), }; // member variables /** maximum number of stale nodes before terminating search */ protected int m_maxStale; /** 0 == forward selection, 1 == floating forward search */ protected int m_forwardSearchMethod; /** perform initial ranking to select top-ranked attributes */ protected boolean m_performRanking; /** * number of top-ranked attributes that are taken into account for the * search */ protected int m_numUsedAttributes; /** 0 == fixed-set, 1 == fixed-width */ protected int m_linearSelectionType; /** holds an array of starting attributes */ protected int[] m_starting; /** holds the start set for the search as a Range */ protected Range m_startRange; /** does the data have a class */ protected boolean m_hasClass; /** holds the class index */ protected int m_classIndex; /** number of attributes in the data */ protected int m_numAttribs; /** total number of subsets evaluated during a search */ protected int m_totalEvals; /** for debugging */ protected boolean m_verbose; /** holds the merit of the best subset found */ protected double m_bestMerit; /** holds the maximum size of the lookup cache for evaluated subsets */ protected int m_cacheSize; /** * Constructor */ public LinearForwardSelection() { resetOptions(); } /** * Returns a string describing this search method * * @return a description of the search method suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "LinearForwardSelection:\n\n" + "Extension of BestFirst. Takes a restricted number of k attributes " + "into account. Fixed-set selects a fixed number k of attributes, " + "whereas k is increased in each step when fixed-width is selected. " + "The search uses either the initial ordering to select the " + "top k attributes, or performs a ranking (with the same evalutator the " + "search uses later on). The search direction can be forward, " + "or floating forward selection (with opitional backward search steps).\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; TechnicalInformation additional; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Martin Guetlein and Eibe Frank and Mark Hall and Andreas Karwath"); result.setValue(Field.YEAR, "2009"); result.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers"); result.setValue(Field.BOOKTITLE, "Proc IEEE Symposium on Computational Intelligence and Data Mining"); result.setValue(Field.PAGES, "332-339"); result.setValue(Field.PUBLISHER, "IEEE"); additional = result.add(Type.MASTERSTHESIS); additional.setValue(Field.AUTHOR, "Martin Guetlein"); additional.setValue(Field.YEAR, "2006"); additional.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers"); additional.setValue(Field.SCHOOL, "Albert-Ludwigs-Universitaet"); additional.setValue(Field.ADDRESS, "Freiburg, Germany"); return result; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. * */ public Enumeration listOptions() { Vector newVector = new Vector(8); newVector.addElement(new Option("\tSpecify a starting set of attributes." + "\n\tEg. 1,3,5-7.", "P", 1, "-P
*
* -P
*
* -D <0 = forward selection | 1 = floating forward selection>
*
* -N
*
* -I
*
* -K
*
* -T <0 = fixed-set | 1 = fixed-width>
*
* -S
*
* -Z
*
* @param options
* the list of options as an array of strings
* @exception Exception
* if an option is not supported
*
*/
public void setOptions(String[] options) throws Exception {
String optionString;
resetOptions();
optionString = Utils.getOption('P', options);
if (optionString.length() != 0) {
setStartSet(optionString);
}
optionString = Utils.getOption('D', options);
if (optionString.length() != 0) {
setForwardSelectionMethod(new SelectedTag(Integer.parseInt(optionString),
TAGS_SEARCH_METHOD));
} else {
setForwardSelectionMethod(new SelectedTag(SEARCH_METHOD_FORWARD,
TAGS_SEARCH_METHOD));
}
optionString = Utils.getOption('N', options);
if (optionString.length() != 0) {
setSearchTermination(Integer.parseInt(optionString));
}
setPerformRanking(Utils.getFlag('I', options));
optionString = Utils.getOption('K', options);
if (optionString.length() != 0) {
setNumUsedAttributes(Integer.parseInt(optionString));
}
optionString = Utils.getOption('T', options);
if (optionString.length() != 0) {
setType(new SelectedTag(Integer.parseInt(optionString), TAGS_TYPE));
} else {
setType(new SelectedTag(TYPE_FIXED_SET, TAGS_TYPE));
}
optionString = Utils.getOption('S', options);
if (optionString.length() != 0) {
setLookupCacheSize(Integer.parseInt(optionString));
}
m_verbose = Utils.getFlag('Z', options);
}
/**
* Set the maximum size of the evaluated subset cache (hashtable). This is
* expressed as a multiplier for the number of attributes in the data set.
* (default = 1).
*
* @param size
* the maximum size of the hashtable
*/
public void setLookupCacheSize(int size) {
if (size >= 0) {
m_cacheSize = size;
}
}
/**
* Return the maximum size of the evaluated subset cache (expressed as a
* multiplier for the number of attributes in a data set.
*
* @return the maximum size of the hashtable.
*/
public int getLookupCacheSize() {
return m_cacheSize;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String lookupCacheSizeTipText() {
return "Set the maximum size of the lookup cache of evaluated subsets. This is " +
"expressed as a multiplier of the number of attributes in the data set. " +
"(default = 1).";
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String startSetTipText() {
return "Set the start point for the search. This is specified as a comma " +
"seperated list off attribute indexes starting at 1. It can include " +
"ranges. Eg. 1,2,5-9,17.";
}
/**
* Sets a starting set of attributes for the search. It is the search
* method's responsibility to report this start set (if any) in its
* toString() method.
*
* @param startSet
* a string containing a list of attributes (and or ranges), eg.
* 1,2,6,10-15.
* @exception Exception
* if start set can't be set.
*/
public void setStartSet(String startSet) throws Exception {
m_startRange.setRanges(startSet);
}
/**
* Returns a list of attributes (and or attribute ranges) as a String
*
* @return a list of attributes (and or attribute ranges)
*/
public String getStartSet() {
return m_startRange.getRanges();
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String searchTerminationTipText() {
return "Set the amount of backtracking. Specify the number of ";
}
/**
* Set the numnber of non-improving nodes to consider before terminating
* search.
*
* @param t
* the number of non-improving nodes
* @exception Exception
* if t is less than 1
*/
public void setSearchTermination(int t) throws Exception {
if (t < 1) {
throw new Exception("Value of -N must be > 0.");
}
m_maxStale = t;
}
/**
* Get the termination criterion (number of non-improving nodes).
*
* @return the number of non-improving nodes
*/
public int getSearchTermination() {
return m_maxStale;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String performRankingTipText() {
return "Perform initial ranking to select top-ranked attributes.";
}
/**
* Perform initial ranking to select top-ranked attributes.
*
* @param b
* true if initial ranking should be performed
*/
public void setPerformRanking(boolean b) {
m_performRanking = b;
}
/**
* Get boolean if initial ranking should be performed to select the
* top-ranked attributes
*
* @return true if initial ranking should be performed
*/
public boolean getPerformRanking() {
return m_performRanking;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String numUsedAttributesTipText() {
return "Set the amount of top-ranked attributes that are taken into account by the search process.";
}
/**
* Set the number of top-ranked attributes that taken into account by the
* search process.
*
* @param k
* the number of attributes
* @exception Exception
* if k is less than 2
*/
public void setNumUsedAttributes(int k) throws Exception {
if (k < 2) {
throw new Exception("Value of -K must be >= 2.");
}
m_numUsedAttributes = k;
}
/**
* Get the number of top-ranked attributes that taken into account by the
* search process.
*
* @return the number of top-ranked attributes that taken into account
*/
public int getNumUsedAttributes() {
return m_numUsedAttributes;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String forwardSelectionMethodTipText() {
return "Set the direction of the search.";
}
/**
* Set the search direction
*
* @param d
* the direction of the search
*/
public void setForwardSelectionMethod(SelectedTag d) {
if (d.getTags() == TAGS_SEARCH_METHOD) {
m_forwardSearchMethod = d.getSelectedTag().getID();
}
}
/**
* Get the search direction
*
* @return the direction of the search
*/
public SelectedTag getForwardSelectionMethod() {
return new SelectedTag(m_forwardSearchMethod, TAGS_SEARCH_METHOD);
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String typeTipText() {
return "Set the type of the search.";
}
/**
* Set the type
*
* @param t
* the Linear Forward Selection type
*/
public void setType(SelectedTag t) {
if (t.getTags() == TAGS_TYPE) {
m_linearSelectionType = t.getSelectedTag().getID();
}
}
/**
* Get the type
*
* @return the Linear Forward Selection type
*/
public SelectedTag getType() {
return new SelectedTag(m_linearSelectionType, TAGS_TYPE);
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String verboseTipText() {
return "Turn on verbose output for monitoring the search's progress.";
}
/**
* Set whether verbose output should be generated.
*
* @param b
* true if output is to be verbose.
*/
public void setVerbose(boolean b) {
m_verbose = b;
}
/**
* Get whether output is to be verbose
*
* @return true if output will be verbose
*/
public boolean getVerbose() {
return m_verbose;
}
/**
* Gets the current settings of LinearForwardSelection.
*
* @return an array of strings suitable for passing to setOptions()
*/
public String[] getOptions() {
String[] options = new String[13];
int current = 0;
if (!(getStartSet().equals(""))) {
options[current++] = "-P";
options[current++] = "" + startSetToString();
}
options[current++] = "-D";
options[current++] = "" + m_forwardSearchMethod;
options[current++] = "-N";
options[current++] = "" + m_maxStale;
if (m_performRanking) {
options[current++] = "-I";
}
options[current++] = "-K";
options[current++] = "" + m_numUsedAttributes;
options[current++] = "-T";
options[current++] = "" + m_linearSelectionType;
if (m_verbose)
options[current++] = "-Z";
while (current < options.length) {
options[current++] = "";
}
return options;
}
/**
* converts the array of starting attributes to a string. This is used by
* getOptions to return the actual attributes specified as the starting set.
* This is better than using m_startRanges.getRanges() as the same start set
* can be specified in different ways from the command line---eg 1,2,3 ==
* 1-3. This is to ensure that stuff that is stored in a database is
* comparable.
*
* @return a comma seperated list of individual attribute numbers as a
* String
*/
private String startSetToString() {
StringBuffer FString = new StringBuffer();
boolean didPrint;
if (m_starting == null) {
return getStartSet();
}
for (int i = 0; i < m_starting.length; i++) {
didPrint = false;
if ((m_hasClass == false) ||
((m_hasClass == true) && (i != m_classIndex))) {
FString.append((m_starting[i] + 1));
didPrint = true;
}
if (i == (m_starting.length - 1)) {
FString.append("");
} else {
if (didPrint) {
FString.append(",");
}
}
}
return FString.toString();
}
/**
* returns a description of the search as a String
*
* @return a description of the search
*/
public String toString() {
StringBuffer LFSString = new StringBuffer();
LFSString.append("\tLinear Forward Selection.\n\tStart set: ");
if (m_starting == null) {
LFSString.append("no attributes\n");
} else {
LFSString.append(startSetToString() + "\n");
}
LFSString.append("\tForward selection method: ");
if (m_forwardSearchMethod == SEARCH_METHOD_FORWARD) {
LFSString.append("forward selection\n");
} else {
LFSString.append("floating forward selection\n");
}
LFSString.append("\tStale search after " + m_maxStale +
" node expansions\n");
LFSString.append("\tLinear Forward Selection Type: ");
if (m_linearSelectionType == TYPE_FIXED_SET) {
LFSString.append("fixed-set\n");
} else {
LFSString.append("fixed-width\n");
}
LFSString.append("\tNumber of top-ranked attributes that are used: " +
m_numUsedAttributes + "\n");
LFSString.append("\tTotal number of subsets evaluated: " + m_totalEvals +
"\n");
LFSString.append("\tMerit of best subset found: " +
Utils.doubleToString(Math.abs(m_bestMerit), 8, 3) + "\n");
return LFSString.toString();
}
/**
* Searches the attribute subset space by linear forward selection
*
* @param ASEval
* the attribute evaluator to guide the search
* @param data
* the training instances.
* @return an array (not necessarily ordered) of selected attribute indexes
* @exception Exception
* if the search can't be completed
*/
public int[] search(ASEvaluation ASEval, Instances data)
throws Exception {
m_totalEvals = 0;
if (!(ASEval instanceof SubsetEvaluator)) {
throw new Exception(ASEval.getClass().getName() + " is not a " +
"Subset evaluator!");
}
if (ASEval instanceof UnsupervisedSubsetEvaluator) {
m_hasClass = false;
} else {
m_hasClass = true;
m_classIndex = data.classIndex();
}
((ASEvaluation) ASEval).buildEvaluator(data);
m_numAttribs = data.numAttributes();
if (m_numUsedAttributes > m_numAttribs) {
System.out.println(
"Decreasing number of top-ranked attributes to total number of attributes: " +
data.numAttributes());
m_numUsedAttributes = m_numAttribs;
}
BitSet start_group = new BitSet(m_numAttribs);
m_startRange.setUpper(m_numAttribs - 1);
if (!(getStartSet().equals(""))) {
m_starting = m_startRange.getSelection();
}
// If a starting subset has been supplied, then initialise the bitset
if (m_starting != null) {
for (int i = 0; i < m_starting.length; i++) {
if ((m_starting[i]) != m_classIndex) {
start_group.set(m_starting[i]);
}
}
}
LFSMethods LFS = new LFSMethods();
int[] ranking;
if (m_performRanking) {
ranking = LFS.rankAttributes(data, (SubsetEvaluator) ASEval, m_verbose);
} else {
ranking = new int[m_numAttribs];
for (int i = 0; i < ranking.length; i++) {
ranking[i] = i;
}
}
if (m_forwardSearchMethod == SEARCH_METHOD_FORWARD) {
LFS.forwardSearch(m_cacheSize, start_group, ranking, m_numUsedAttributes,
m_linearSelectionType == TYPE_FIXED_WIDTH, m_maxStale, -1, data,
(SubsetEvaluator) ASEval, m_verbose);
} else if (m_forwardSearchMethod == SEARCH_METHOD_FLOATING) {
LFS.floatingForwardSearch(m_cacheSize, start_group, ranking,
m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH,
m_maxStale, data, (SubsetEvaluator) ASEval, m_verbose);
}
m_totalEvals = LFS.getNumEvalsTotal();
m_bestMerit = LFS.getBestMerit();
return attributeList(LFS.getBestGroup());
}
/**
* Reset options to default values
*/
protected void resetOptions() {
m_maxStale = 5;
m_forwardSearchMethod = SEARCH_METHOD_FORWARD;
m_performRanking = true;
m_numUsedAttributes = 50;
m_linearSelectionType = TYPE_FIXED_SET;
m_starting = null;
m_startRange = new Range();
m_classIndex = -1;
m_totalEvals = 0;
m_cacheSize = 1;
m_verbose = false;
}
/**
* converts a BitSet into a list of attribute indexes
*
* @param group
* the BitSet to convert
* @return an array of attribute indexes
*/
protected int[] attributeList(BitSet group) {
int count = 0;
// count how many were selected
for (int i = 0; i < m_numAttribs; i++) {
if (group.get(i)) {
count++;
}
}
int[] list = new int[count];
count = 0;
for (int i = 0; i < m_numAttribs; i++) {
if (group.get(i)) {
list[count++] = i;
}
}
return list;
}
/**
* Returns the revision string.
*
* @return the revision
*/
public String getRevision() {
return RevisionUtils.extract("$Revision: 6160 $");
}
}
* Specify a starting set of attributes. Eg 1,4,7-9.
*
* Forward selection method of the search. (default = 0).
*
* Number of non improving nodes to consider before terminating search.
* (default = 5).
*
* Perform initial ranking to select top-ranked attributes.
*
* Number of top-ranked attributes that are taken into account.
*
* Typ of Linear Forward Selection (default = 0).
*
* Size of lookup cache for evaluated subsets. Expressed as a multiple of
* the number of attributes in the data set. (default = 1).
*
* verbose on/off.
*