/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * PrincipalComponents.java * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand * */ package weka.attributeSelection; import java.util.Enumeration; import java.util.Vector; import weka.core.Attribute; import weka.core.Capabilities; import weka.core.FastVector; import weka.core.Instance; import weka.core.DenseInstance; import weka.core.Instances; import weka.core.Matrix; import weka.core.Option; import weka.core.OptionHandler; import weka.core.RevisionUtils; import weka.core.SparseInstance; import weka.core.Utils; import weka.core.Capabilities.Capability; import weka.filters.Filter; import weka.filters.unsupervised.attribute.NominalToBinary; import weka.filters.unsupervised.attribute.Normalize; import weka.filters.unsupervised.attribute.Remove; import weka.filters.unsupervised.attribute.ReplaceMissingValues; /** * Performs a principal components analysis and transformation of the data. Use in conjunction with a Ranker search. Dimensionality reduction is accomplished by choosing enough eigenvectors to account for some percentage of the variance in the original data---default 0.95 (95%). Attribute noise can be filtered by transforming to the PC space, eliminating some of the worst eigenvectors, and then transforming back to the original space. *
 *
 * Valid options are:
 *
 * -D
 *  Don't normalize input data.
 *
 * -R
 *  Retain enough PC attributes to account
 *  for this proportion of variance in the original data.
 *  (default = 0.95)
 *
 * -O
 *  Transform through the PC space and
 *  back to the original space.
 *
 *
-A
 *  Maximum number of attributes to include in
 *  transformed attribute names. (-1 = include all)
 *
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
 * @version $Revision: 5987 $
 */
public class PrincipalComponents
  extends UnsupervisedAttributeEvaluator
  implements AttributeTransformer, OptionHandler {

  /** for serialization */
  static final long serialVersionUID = 3310137541055815078L;

  /** The data to analyse/transform */
  private Instances m_trainInstances;

  /** Keep a copy for the class attribute (if set) */
  private Instances m_trainHeader;

  /** The header for the transformed data format */
  private Instances m_transformedFormat;

  /** The header for data transformed back to the original space */
  private Instances m_originalSpaceFormat;

  /** Data has a class set */
  private boolean m_hasClass;

  /** Class index */
  private int m_classIndex;

  /** Number of attributes */
  private int m_numAttribs;

  /** Number of instances */
  private int m_numInstances;

  /** Correlation matrix for the original data */
  private double [][] m_correlation;

  /** Will hold the unordered linear transformations of the
      (normalized) original data */
  private double [][] m_eigenvectors;

  /** Eigenvalues for the corresponding eigenvectors */
  private double [] m_eigenvalues = null;

  /** Sorted eigenvalues.
      NOTE(review): this is an int array, so it presumably holds indices
      into m_eigenvalues in sorted order rather than the values
      themselves -- confirm against the code that fills it. */
  private int [] m_sortedEigens;

  /** sum of the eigenvalues */
  private double m_sumOfEigenValues = 0.0;

  /** Filters for original data */
  private ReplaceMissingValues m_replaceMissingFilter;
  private Normalize m_normalizeFilter;
  private NominalToBinary m_nominalToBinFilter;
  // NOTE(review): a second Remove filter, distinct from m_attribFilter
  // below; what it removes is not documented here -- confirm at its use.
  private Remove m_attributeFilter;

  /** used to remove the class column if a class column is set */
  private Remove m_attribFilter;

  /** The number of attributes in the pc transformed data */
  private int m_outputNumAtts = -1;

  /** normalize the input data?
*/
  private boolean m_normalize = true;

  /** the amount of variance to cover in the original data when
      retaining the best n PC's */
  private double m_coverVariance = 0.95;

  /** transform the data through the pc space and back to the original
      space ? */
  private boolean m_transBackToOriginal = false;

  /** maximum number of attributes in the transformed attribute name */
  private int m_maxAttrsInName = 5;

  /** holds the transposed eigenvectors for converting back to the
      original space */
  private double [][] m_eTranspose;

  /**
   * Describes this attribute transformer.
   *
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    // One constant per sentence; the sum is still a compile-time constant.
    final String purpose =
      "Performs a principal components analysis and transformation of the data. ";
    final String usage =
      "Use in conjunction with a Ranker search. ";
    final String reduction =
      "Dimensionality reduction is accomplished by choosing enough "
      + "eigenvectors to account for some percentage of the variance in the "
      + "original data---default 0.95 (95%). ";
    final String denoising =
      "Attribute noise can be filtered by transforming to the PC space, "
      + "eliminating some of the worst eigenvectors, and then transforming back "
      + "to the original space.";

    return purpose + usage + reduction + denoising;
  }

  /**
   * Returns an enumeration describing the available options.
* The enumeration holds one {@link Option} each for -D, -R, -O and -A,
   * in that order.
   *
   * @return an enumeration of all the available options.
   **/
  public Enumeration listOptions () {
    // Sized for the four options registered below.
    Vector<Option> options = new Vector<Option>(4);

    options.add(new Option("\tDon't normalize input data.",
                           "D", 0, "-D"));

    options.add(new Option("\tRetain enough PC attributes to account "
                           + "\n\tfor this proportion of variance in "
                           + "the original data.\n"
                           + "\t(default = 0.95)",
                           "R", 1, "-R"));

    options.add(new Option("\tTransform through the PC space and "
                           + "\n\tback to the original space.",
                           "O", 0, "-O"));

    options.add(new Option("\tMaximum number of attributes to include in "
                           + "\n\ttransformed attribute names. (-1 = include all)",
                           "A", 1, "-A"));

    return options.elements();
  }

  /**
   * Parses a given list of options.
   *
   * Valid options are:
   *
   * -D
   *  Don't normalize input data.
   *
   * -R
   *  Retain enough PC attributes to account
   *  for this proportion of variance in the original data.
   *  (default = 0.95)
   *
   * -O
   *  Transform through the PC space and
   *  back to the original space.
   *
   *
* -A
   *  Maximum number of attributes to include in
   *  transformed attribute names. (-1 = include all)
   *
   * Every option is first reset to its default, so an option that is
   * absent from the array ends up at its default value.
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions (String[] options) throws Exception {
    resetOptions();

    String varianceArg = Utils.getOption('R', options);
    if (varianceArg.length() != 0) {
      setVarianceCovered(Double.parseDouble(varianceArg));
    }

    String maxNamesArg = Utils.getOption('A', options);
    if (maxNamesArg.length() != 0) {
      setMaximumAttributeNames(Integer.parseInt(maxNamesArg));
    }

    // -D switches normalization OFF, hence the negation
    setNormalize(!Utils.getFlag('D', options));

    setTransformBackToOriginal(Utils.getFlag('O', options));
  }

  /**
   * Reset to defaults: variance covered 0.95, normalization on, no
   * transformation back to the original space, at most 5 attributes in
   * a transformed attribute name. Also zeroes the eigenvalue sum.
   */
  private void resetOptions() {
    m_coverVariance = 0.95;
    m_normalize = true;
    m_sumOfEigenValues = 0.0;
    m_transBackToOriginal = false;
    // must match the initial value of the m_maxAttrsInName field; without
    // this a previous -A setting would survive a setOptions() call that
    // does not mention -A
    m_maxAttrsInName = 5;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String normalizeTipText() {
    return "Normalize input data.";
  }

  /**
   * Set whether input data will be normalized.
* @param n true if input data is to be normalized */ public void setNormalize(boolean n) { m_normalize = n; } /** * Gets whether or not input data is to be normalized * @return true if input data is to be normalized */ public boolean getNormalize() { return m_normalize; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String varianceCoveredTipText() { return "Retain enough PC attributes to account for this proportion of " +"variance."; } /** * Sets the amount of variance to account for when retaining * principal components * @param vc the proportion of total variance to account for */ public void setVarianceCovered(double vc) { m_coverVariance = vc; } /** * Gets the proportion of total variance to account for when * retaining principal components * @return the proportion of variance to account for */ public double getVarianceCovered() { return m_coverVariance; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String maximumAttributeNamesTipText() { return "The maximum number of attributes to include in transformed attribute names."; } /** * Sets maximum number of attributes to include in * transformed attribute names. * @param m the maximum number of attributes */ public void setMaximumAttributeNames(int m) { m_maxAttrsInName = m; } /** * Gets maximum number of attributes to include in * transformed attribute names. * @return the maximum number of attributes */ public int getMaximumAttributeNames() { return m_maxAttrsInName; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String transformBackToOriginalTipText() { return "Transform through the PC space and back to the original space. 
" +"If only the best n PCs are retained (by setting varianceCovered < 1) " +"then this option will give a dataset in the original space but with " +"less attribute noise."; } /** * Sets whether the data should be transformed back to the original * space * @param b true if the data should be transformed back to the * original space */ public void setTransformBackToOriginal(boolean b) { m_transBackToOriginal = b; } /** * Gets whether the data is to be transformed back to the original * space. * @return true if the data is to be transformed back to the original space */ public boolean getTransformBackToOriginal() { return m_transBackToOriginal; } /** * Gets the current settings of PrincipalComponents * * @return an array of strings suitable for passing to setOptions() */ public String[] getOptions () { String[] options = new String[6]; int current = 0; if (!getNormalize()) { options[current++] = "-D"; } options[current++] = "-R"; options[current++] = ""+getVarianceCovered(); options[current++] = "-A"; options[current++] = ""+getMaximumAttributeNames(); if (getTransformBackToOriginal()) { options[current++] = "-O"; } while (current < options.length) { options[current++] = ""; } return options; } /** * Returns the capabilities of this evaluator. 
* * @return the capabilities of this evaluator * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.NUMERIC_CLASS); result.enable(Capability.DATE_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Initializes principal components and performs the analysis * @param data the instances to analyse/transform * @throws Exception if analysis fails */ public void buildEvaluator(Instances data) throws Exception { // can evaluator handle data? getCapabilities().testWithFail(data); buildAttributeConstructor(data); } private void buildAttributeConstructor (Instances data) throws Exception { m_eigenvalues = null; m_outputNumAtts = -1; m_attributeFilter = null; m_nominalToBinFilter = null; m_sumOfEigenValues = 0.0; m_trainInstances = new Instances(data); // make a copy of the training data so that we can get the class // column to append to the transformed data (if necessary) m_trainHeader = new Instances(m_trainInstances, 0); m_replaceMissingFilter = new ReplaceMissingValues(); m_replaceMissingFilter.setInputFormat(m_trainInstances); m_trainInstances = Filter.useFilter(m_trainInstances, m_replaceMissingFilter); if (m_normalize) { m_normalizeFilter = new Normalize(); m_normalizeFilter.setInputFormat(m_trainInstances); m_trainInstances = Filter.useFilter(m_trainInstances, m_normalizeFilter); } m_nominalToBinFilter = new NominalToBinary(); m_nominalToBinFilter.setInputFormat(m_trainInstances); m_trainInstances = Filter.useFilter(m_trainInstances, m_nominalToBinFilter); // delete any attributes with only one distinct value or are all missing Vector deleteCols = new Vector(); for 
(int i=0;i