/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * AttributeSelection.java
 * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.attributeSelection;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

import java.beans.BeanInfo;
import java.beans.IntrospectionException;
import java.beans.Introspector;
import java.beans.MethodDescriptor;
import java.beans.PropertyDescriptor;
import java.io.Serializable;
import java.lang.reflect.Method;
import java.util.Enumeration;
import java.util.Random;

/**
 * Attribute selection class. Takes the name of a search class and
 * an evaluation class on the command line.
 *
 * Valid options are:
 *
 * -h
 * Display help.
 *
 * -i <name of input file>
 * Specify the training data file.
 *
 * -c <class index>
 * The index of the attribute to use as the class.
 *
 * -s <search method>
 * The full class name of the search method, followed by search method
 * options (if any).
 * Eg. -s "weka.attributeSelection.BestFirst -N 10"
 *
 * -x <number of folds>
 * Perform a cross validation.
 *
 * -n <random number seed>
 * Specify a random number seed. Use in conjunction with -x. (Default = 1).
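 *
 * For example, running this class directly from the command line might look
 * roughly as follows; the evaluator class and data file name are placeholders,
 * and the first argument names the attribute/subset evaluator (see the main
 * method below):
 *
 * java weka.attributeSelection.AttributeSelection \
 *     weka.attributeSelection.FunkyEvaluator \
 *     -i mydata.arff -s "weka.attributeSelection.BestFirst -N 10" -x 10 -n 1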

 *
 * ------------------------------------------------------------------------
 *
 * Example usage as the main of an attribute evaluator (called FunkyEvaluator):
 *
 * public static void main(String [] args) {
 *   runEvaluator(new FunkyEvaluator(), args);
 * }
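 *
 * The class can also be driven programmatically. A minimal sketch, assuming
 * FunkyEvaluator is the hypothetical evaluator from the example above and
 * train holds the training Instances:
 *
 * AttributeSelection attsel = new AttributeSelection();
 * attsel.setEvaluator(new FunkyEvaluator());
 * attsel.setSearch(new BestFirst());
 * attsel.SelectAttributes(train);
 * int [] selected = attsel.selectedAttributes();
 * Instances reduced = attsel.reduceDimensionality(train);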

 *
 * ------------------------------------------------------------------------
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.47 $
 */
public class AttributeSelection implements Serializable, RevisionHandler {

  /** for serialization */
  static final long serialVersionUID = 4170171824147584330L;

  /** the instances to select attributes from */
  private Instances m_trainInstances;

  /** the attribute/subset evaluator */
  private ASEvaluation m_ASEvaluator;

  /** the search method */
  private ASSearch m_searchMethod;

  /** the number of folds to use for cross validation */
  private int m_numFolds;

  /** holds a string describing the results of the attribute selection */
  private StringBuffer m_selectionResults;

  /** rank features (if allowed by the search method) */
  private boolean m_doRank;

  /** do cross validation */
  private boolean m_doXval;

  /** seed used to randomly shuffle instances for cross validation */
  private int m_seed;

  /** number of attributes requested from ranked results */
  private int m_numToSelect;

  /** the selected attributes */
  private int [] m_selectedAttributeSet;

  /** the attribute indexes and associated merits if a ranking is produced */
  private double [][] m_attributeRanking;

  /** if a feature selection run involves an attribute transformer */
  private AttributeTransformer m_transformer = null;

  /** the attribute filter for processing instances with respect to
      the most recent feature selection run */
  private Remove m_attributeFilter = null;

  /** hold statistics for repeated feature selection, such as
      under cross validation */
  private double [][] m_rankResults = null;
  private double [] m_subsetResults = null;
  private int m_trials = 0;

  /**
   * Return the number of attributes selected from the most recent
   * run of attribute selection
   * @return the number of attributes selected
   */
  public int numberAttributesSelected() throws Exception {
    int [] att = selectedAttributes();
    return att.length - 1;
  }

  /**
   * get the final selected set of attributes.
   * @return an array of attribute indexes
   * @exception Exception if attribute selection has not been performed yet
   */
  public int [] selectedAttributes() throws Exception {
    if (m_selectedAttributeSet == null) {
      throw new Exception("Attribute selection has not been performed yet!");
    }
    return m_selectedAttributeSet;
  }

  /**
   * get the final ranking of the attributes.
   * @return a two dimensional array of ranked attribute indexes and their
   * associated merit scores as doubles.
* @exception Exception if a ranking has not been produced */ public double [][] rankedAttributes () throws Exception { if (m_attributeRanking == null) { throw new Exception("Ranking has not been performed"); } return m_attributeRanking; } /** * set the attribute/subset evaluator * @param evaluator the evaluator to use */ public void setEvaluator (ASEvaluation evaluator) { m_ASEvaluator = evaluator; } /** * set the search method * @param search the search method to use */ public void setSearch (ASSearch search) { m_searchMethod = search; if (m_searchMethod instanceof RankedOutputSearch) { setRanking(((RankedOutputSearch)m_searchMethod).getGenerateRanking()); } } /** * set the number of folds for cross validation * @param folds the number of folds */ public void setFolds (int folds) { m_numFolds = folds; } /** * produce a ranking (if possible with the set search and evaluator) * @param r true if a ranking is to be produced */ public void setRanking (boolean r) { m_doRank = r; } /** * do a cross validation * @param x true if a cross validation is to be performed */ public void setXval (boolean x) { m_doXval = x; } /** * set the seed for use in cross validation * @param s the seed */ public void setSeed (int s) { m_seed = s; } /** * get a description of the attribute selection * @return a String describing the results of attribute selection */ public String toResultsString() { return m_selectionResults.toString(); } /** * reduce the dimensionality of a set of instances to include only those * attributes chosen by the last run of attribute selection. * @param in the instances to be reduced * @return a dimensionality reduced set of instances * @exception Exception if the instances can't be reduced */ public Instances reduceDimensionality(Instances in) throws Exception { if (m_attributeFilter == null) { throw new Exception("No feature selection has been performed yet!"); } if (m_transformer != null) { Instances transformed = new Instances(m_transformer.transformedHeader(), in.numInstances()); for (int i=0;i 0) { CvString.append(Utils.doubleToString(Math. abs(m_rankResults[0][s[i]]), 6, 3) + " +-" + Utils.doubleToString(m_rankResults[2][s[i]], 6, 3) + " " + Utils.doubleToString(m_rankResults[1][s[i]], fieldWidth+2, 1) + " +-" + Utils.doubleToString(m_rankResults[3][s[i]], 5, 2) +" " + Utils.doubleToString(((double)(s[i] + 1)), fieldWidth, 0) + " " + m_trainInstances.attribute(s[i]).name() + "\n"); } } } else { CvString.append("number of folds (%) attribute\n"); for (int i = 0; i < m_subsetResults.length; i++) { if ((m_ASEvaluator instanceof UnsupervisedSubsetEvaluator) || (i != m_trainInstances.classIndex())) { CvString.append(Utils.doubleToString(m_subsetResults[i], 12, 0) + "(" + Utils.doubleToString((m_subsetResults[i] / m_numFolds * 100.0) , 3, 0) + " %) " + Utils.doubleToString(((double)(i + 1)), fieldWidth, 0) + " " + m_trainInstances.attribute(i).name() + "\n"); } } } return CvString.toString(); } /** * Select attributes for a split of the data. Calling this function * updates the statistics on attribute selection. CVResultsString() * returns a string summarizing the results of repeated calls to * this function. Assumes that splits are from the same dataset--- * ie. have the same number and types of attributes as previous * splits. 
* * @param split the instances to select attributes from * @exception Exception if an error occurs */ public void selectAttributesCVSplit(Instances split) throws Exception { double[][] attributeRanking = null; // if the train instances are null then set equal to this split. // If this is the case then this function is more than likely being // called from outside this class in order to obtain CV statistics // and all we need m_trainIstances for is to get at attribute names // and types etc. if (m_trainInstances == null) { m_trainInstances = split; } // create space to hold statistics if (m_rankResults == null && m_subsetResults == null) { m_subsetResults = new double[split.numAttributes()]; m_rankResults = new double[4][split.numAttributes()]; } m_ASEvaluator.buildEvaluator(split); // Do the search int[] attributeSet = m_searchMethod.search(m_ASEvaluator, split); // Do any postprocessing that a attribute selection method might // require attributeSet = m_ASEvaluator.postProcess(attributeSet); if ((m_searchMethod instanceof RankedOutputSearch) && (m_doRank == true)) { attributeRanking = ((RankedOutputSearch)m_searchMethod). rankedAttributes(); // System.out.println(attributeRanking[0][1]); for (int j = 0; j < attributeRanking.length; j++) { // merit m_rankResults[0][(int)attributeRanking[j][0]] += attributeRanking[j][1]; // squared merit m_rankResults[2][(int)attributeRanking[j][0]] += (attributeRanking[j][1]*attributeRanking[j][1]); // rank m_rankResults[1][(int)attributeRanking[j][0]] += (j + 1); // squared rank m_rankResults[3][(int)attributeRanking[j][0]] += (j + 1)*(j + 1); // += (attributeRanking[j][0] * attributeRanking[j][0]); } } else { for (int j = 0; j < attributeSet.length; j++) { m_subsetResults[attributeSet[j]]++; } } m_trials++; } /** * Perform a cross validation for attribute selection. With subset * evaluators the number of times each attribute is selected over * the cross validation is reported. For attribute evaluators, the * average merit and average ranking + std deviation is reported for * each attribute. * * @return the results of cross validation as a String * @exception Exception if an error occurs during cross validation */ public String CrossValidateAttributes () throws Exception { Instances cvData = new Instances(m_trainInstances); Instances train; Random random = new Random(m_seed); cvData.randomize(random); if (!(m_ASEvaluator instanceof UnsupervisedSubsetEvaluator) && !(m_ASEvaluator instanceof UnsupervisedAttributeEvaluator)) { if (cvData.classAttribute().isNominal()) { cvData.stratify(m_numFolds); } } for (int i = 0; i < m_numFolds; i++) { // Perform attribute selection train = cvData.trainCV(m_numFolds, i, random); selectAttributesCVSplit(train); } return CVResultsString(); } /** * Perform attribute selection on the supplied training instances. 
* * @param data the instances to select attributes from * @exception Exception if there is a problem during selection */ public void SelectAttributes (Instances data) throws Exception { int [] attributeSet; m_transformer = null; m_attributeFilter = null; m_trainInstances = data; if (m_doXval == true && (m_ASEvaluator instanceof AttributeTransformer)) { throw new Exception("Can't cross validate an attribute transformer."); } if (m_ASEvaluator instanceof SubsetEvaluator && m_searchMethod instanceof Ranker) { throw new Exception(m_ASEvaluator.getClass().getName() +" must use a search method other than Ranker"); } if (m_ASEvaluator instanceof AttributeEvaluator && !(m_searchMethod instanceof Ranker)) { // System.err.println("AttributeEvaluators must use a Ranker search " // +"method. Switching to Ranker..."); // m_searchMethod = new Ranker(); throw new Exception("AttributeEvaluators must use the Ranker search " + "method"); } if (m_searchMethod instanceof RankedOutputSearch) { m_doRank = ((RankedOutputSearch)m_searchMethod).getGenerateRanking(); } if (m_ASEvaluator instanceof UnsupervisedAttributeEvaluator || m_ASEvaluator instanceof UnsupervisedSubsetEvaluator) { // unset the class index // m_trainInstances.setClassIndex(-1); } else { // check that a class index has been set if (m_trainInstances.classIndex() < 0) { m_trainInstances.setClassIndex(m_trainInstances.numAttributes()-1); } } // Initialize the attribute evaluator m_ASEvaluator.buildEvaluator(m_trainInstances); if (m_ASEvaluator instanceof AttributeTransformer) { m_trainInstances = ((AttributeTransformer)m_ASEvaluator).transformedHeader(); m_transformer = (AttributeTransformer)m_ASEvaluator; } int fieldWidth = (int)(Math.log(m_trainInstances.numAttributes()) +1.0); // Do the search attributeSet = m_searchMethod.search(m_ASEvaluator, m_trainInstances); // try and determine if the search method uses an attribute transformer--- // this is a bit of a hack to make things work properly with RankSearch // using PrincipalComponents as its attribute ranker try { BeanInfo bi = Introspector.getBeanInfo(m_searchMethod.getClass()); PropertyDescriptor properties[]; MethodDescriptor methods[]; // methods = bi.getMethodDescriptors(); properties = bi.getPropertyDescriptors(); for (int i=0;i 0) { precision = Math.abs((Math.log(Math.abs(precision)) / Math.log(10)))+3; } if (precision > f_p) { f_p = (int)precision; } if (intPart == 0) { if (w_p < 2) { w_p = 2; } } else if ((Math.abs((Math.log(Math.abs(m_attributeRanking[i][1])) / Math.log(10)))+1) > w_p) { if (m_attributeRanking[i][1] > 0) { w_p = (int)Math.abs((Math.log(Math.abs(m_attributeRanking[i][1])) / Math.log(10)))+1; } } } for (int i = 0; i < m_numToSelect; i++) { m_selectionResults. append(Utils.doubleToString(m_attributeRanking[i][1], f_p+w_p+1,f_p) + Utils.doubleToString((m_attributeRanking[i][0] + 1), fieldWidth+1,0) + " " + m_trainInstances. 
attribute((int)m_attributeRanking[i][0]).name() + "\n"); } // set up the selected attributes array - usable by a filter or // whatever if (m_trainInstances.classIndex() >= 0) { if ((!(m_ASEvaluator instanceof UnsupervisedSubsetEvaluator) && !(m_ASEvaluator instanceof UnsupervisedAttributeEvaluator)) || m_ASEvaluator instanceof AttributeTransformer) { // one more for the class m_selectedAttributeSet = new int[m_numToSelect + 1]; m_selectedAttributeSet[m_numToSelect] = m_trainInstances.classIndex(); } else { m_selectedAttributeSet = new int[m_numToSelect]; } } else { m_selectedAttributeSet = new int[m_numToSelect]; } m_selectionResults.append("\nSelected attributes: "); for (int i = 0; i < m_numToSelect; i++) { m_selectedAttributeSet[i] = (int)m_attributeRanking[i][0]; if (i == m_numToSelect - 1) { m_selectionResults.append(((int)m_attributeRanking[i][0] + 1) + " : " + (i + 1) + "\n"); } else { m_selectionResults.append(((int)m_attributeRanking[i][0] + 1)); m_selectionResults.append(","); } } } else { // set up the selected attributes array - usable by a filter or // whatever if ((!(m_ASEvaluator instanceof UnsupervisedSubsetEvaluator) && !(m_ASEvaluator instanceof UnsupervisedAttributeEvaluator)) || m_trainInstances.classIndex() >= 0) // one more for the class { m_selectedAttributeSet = new int[attributeSet.length + 1]; m_selectedAttributeSet[attributeSet.length] = m_trainInstances.classIndex(); } else { m_selectedAttributeSet = new int[attributeSet.length]; } for (int i = 0; i < attributeSet.length; i++) { m_selectedAttributeSet[i] = attributeSet[i]; } m_selectionResults.append("Selected attributes: "); for (int i = 0; i < attributeSet.length; i++) { if (i == (attributeSet.length - 1)) { m_selectionResults.append((attributeSet[i] + 1) + " : " + attributeSet.length + "\n"); } else { m_selectionResults.append((attributeSet[i] + 1) + ","); } } for (int i=0;i train.numAttributes()))) { throw new Exception("Class index out of range."); } if (classIndex != -1) { train.setClassIndex(classIndex - 1); } else { // classIndex = train.numAttributes(); // train.setClassIndex(classIndex - 1); } foldsString = Utils.getOption('x', options); if (foldsString.length() != 0) { folds = Integer.parseInt(foldsString); doCrossVal = true; } trainSelector.setFolds(folds); trainSelector.setXval(doCrossVal); seedString = Utils.getOption('n', options); if (seedString.length() != 0) { seed = Integer.parseInt(seedString); } trainSelector.setSeed(seed); searchName = Utils.getOption('s', options); if ((searchName.length() == 0) && (!(ASEvaluator instanceof AttributeEvaluator))) { throw new Exception("No search method given."); } if (searchName.length() != 0) { searchName = searchName.trim(); // split off any search options int breakLoc = searchName.indexOf(' '); searchClassName = searchName; String searchOptionsString = ""; if (breakLoc != -1) { searchClassName = searchName.substring(0, breakLoc); searchOptionsString = searchName.substring(breakLoc).trim(); searchOptions = Utils.splitOptions(searchOptionsString); } } else { try { searchClassName = new String("weka.attributeSelection.Ranker"); searchMethod = (ASSearch)Class. 
forName(searchClassName).newInstance(); } catch (Exception e) { throw new Exception("Can't create Ranker object"); } } // if evaluator is a subset evaluator // create search method and set its options (if any) if (searchMethod == null) { searchMethod = ASSearch.forName(searchClassName, searchOptions); } // set the search method trainSelector.setSearch(searchMethod); } catch (Exception e) { throw new Exception('\n' + e.getMessage() + makeOptionString(ASEvaluator, searchMethod)); } try { // Set options for ASEvaluator if (ASEvaluator instanceof OptionHandler) { ((OptionHandler)ASEvaluator).setOptions(options); } /* // Set options for Search method if (searchMethod instanceof OptionHandler) { if (searchOptions != null) { ((OptionHandler)searchMethod).setOptions(searchOptions); } } Utils.checkForRemainingOptions(searchOptions); */ } catch (Exception e) { throw new Exception("\n" + e.getMessage() + makeOptionString(ASEvaluator, searchMethod)); } try { Utils.checkForRemainingOptions(options); } catch (Exception e) { throw new Exception('\n' + e.getMessage() + makeOptionString(ASEvaluator, searchMethod)); } if (helpRequested) { System.out.println(makeOptionString(ASEvaluator, searchMethod)); System.exit(0); } // set the attribute evaluator trainSelector.setEvaluator(ASEvaluator); // do the attribute selection trainSelector.SelectAttributes(train); // return the results string return trainSelector.toResultsString(); } /** * Assembles a text description of the attribute selection results. * * @return a string describing the results of attribute selection. */ private String printSelectionResults () { StringBuffer text = new StringBuffer(); text.append("\n\n=== Attribute Selection on all input data ===\n\n" + "Search Method:\n"); text.append(m_searchMethod.toString()); text.append("\nAttribute "); if (m_ASEvaluator instanceof SubsetEvaluator) { text.append("Subset Evaluator ("); } else { text.append("Evaluator ("); } if (!(m_ASEvaluator instanceof UnsupervisedSubsetEvaluator) && !(m_ASEvaluator instanceof UnsupervisedAttributeEvaluator)) { text.append("supervised, "); text.append("Class ("); if (m_trainInstances.attribute(m_trainInstances.classIndex()) .isNumeric()) { text.append("numeric): "); } else { text.append("nominal): "); } text.append((m_trainInstances.classIndex() + 1) + " " + m_trainInstances.attribute(m_trainInstances .classIndex()).name() + "):\n"); } else { text.append("unsupervised):\n"); } text.append(m_ASEvaluator.toString() + "\n"); return text.toString(); } /** * Make up the help string giving all the command line options * * @param ASEvaluator the attribute evaluator to include options for * @param searchMethod the search method to include options for * @return a string detailing the valid command line options * @throws Exception if something goes wrong */ private static String makeOptionString (ASEvaluation ASEvaluator, ASSearch searchMethod) throws Exception { StringBuffer optionsText = new StringBuffer(""); // General options optionsText.append("\n\nGeneral options:\n\n"); optionsText.append("-h\n\tdisplay this help\n"); optionsText.append("-i \n"); optionsText.append("\tSets training file.\n"); optionsText.append("-c \n"); optionsText.append("\tSets the class index for supervised attribute\n"); optionsText.append("\tselection. 
Default=last column.\n"); optionsText.append("-s \n"); optionsText.append("\tSets search method for subset evaluators.\n"); optionsText.append("-x \n"); optionsText.append("\tPerform a cross validation.\n"); optionsText.append("-n \n"); optionsText.append("\tUse in conjunction with -x.\n"); // Get attribute evaluator-specific options if (ASEvaluator instanceof OptionHandler) { optionsText.append("\nOptions specific to " + ASEvaluator.getClass().getName() + ":\n\n"); Enumeration enu = ((OptionHandler)ASEvaluator).listOptions(); while (enu.hasMoreElements()) { Option option = (Option)enu.nextElement(); optionsText.append(option.synopsis() + '\n'); optionsText.append(option.description() + "\n"); } } if (searchMethod != null) { if (searchMethod instanceof OptionHandler) { optionsText.append("\nOptions specific to " + searchMethod.getClass().getName() + ":\n\n"); Enumeration enu = ((OptionHandler)searchMethod).listOptions(); while (enu.hasMoreElements()) { Option option = (Option)enu.nextElement(); optionsText.append(option.synopsis() + '\n'); optionsText.append(option.description() + "\n"); } } } else { if (ASEvaluator instanceof SubsetEvaluator) { System.out.println("No search method given."); } } return optionsText.toString(); } /** * Main method for testing this class. * * @param args the options */ public static void main (String[] args) { try { if (args.length == 0) { throw new Exception("The first argument must be the name of an " + "attribute/subset evaluator"); } String EvaluatorName = args[0]; args[0] = ""; ASEvaluation newEval = ASEvaluation.forName(EvaluatorName, null); System.out.println(SelectAttributes(newEval, args)); } catch (Exception e) { System.out.println(e.getMessage()); } } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 1.47 $"); } }