| 1 | /* | 
|---|
| 2 |  *    This program is free software; you can redistribute it and/or modify | 
|---|
| 3 |  *    it under the terms of the GNU General Public License as published by | 
|---|
| 4 |  *    the Free Software Foundation; either version 2 of the License, or | 
|---|
| 5 |  *    (at your option) any later version. | 
|---|
| 6 |  * | 
|---|
| 7 |  *    This program is distributed in the hope that it will be useful, | 
|---|
| 8 |  *    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|---|
| 9 |  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|---|
| 10 |  *    GNU General Public License for more details. | 
|---|
| 11 |  * | 
|---|
| 12 |  *    You should have received a copy of the GNU General Public License | 
|---|
| 13 |  *    along with this program; if not, write to the Free Software | 
|---|
| 14 |  *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 
|---|
| 15 |  */ | 
|---|
| 16 |  | 
|---|
| 17 | /* | 
|---|
| 18 |  * PrincipalComponents.java | 
|---|
| 19 |  * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand | 
|---|
| 20 |  */ | 
|---|
| 21 |  | 
|---|
| 22 | package weka.filters.unsupervised.attribute; | 
|---|
| 23 |  | 
|---|
| 24 | import weka.core.Attribute; | 
|---|
| 25 | import weka.core.Capabilities; | 
|---|
| 26 | import weka.core.FastVector; | 
|---|
| 27 | import weka.core.Instance;  | 
|---|
| 28 | import weka.core.DenseInstance; | 
|---|
| 29 | import weka.core.Instances; | 
|---|
| 30 | import weka.core.Option; | 
|---|
| 31 | import weka.core.OptionHandler; | 
|---|
| 32 | import weka.core.RevisionUtils; | 
|---|
| 33 | import weka.core.SparseInstance; | 
|---|
| 34 | import weka.core.Utils; | 
|---|
| 35 | import weka.core.Capabilities.Capability; | 
|---|
| 36 | import weka.core.matrix.EigenvalueDecomposition; | 
|---|
| 37 | import weka.core.matrix.Matrix; | 
|---|
| 38 | import weka.filters.Filter; | 
|---|
| 39 | import weka.filters.UnsupervisedFilter; | 
|---|
| 40 |  | 
|---|
| 41 | import java.util.Enumeration; | 
|---|
| 42 | import java.util.Vector; | 
|---|
| 43 |  | 
|---|
| 44 | /** | 
|---|
| 45 |  <!-- globalinfo-start --> | 
|---|
| 46 |  * Performs a principal components analysis and transformation of the data.<br/> | 
|---|
| 47 |  * Dimensionality reduction is accomplished by choosing enough eigenvectors to account for some percentage of the variance in the original data -- default 0.95 (95%).<br/> | 
|---|
| 48 |  * Based on code of the attribute selection scheme 'PrincipalComponents' by Mark Hall and Gabi Schmidberger. | 
|---|
| 49 |  * <p/> | 
|---|
| 50 |  <!-- globalinfo-end --> | 
|---|
| 51 |  *  | 
|---|
| 52 |  <!-- options-start --> | 
|---|
| 53 |  * Valid options are: <p/> | 
|---|
| 54 |  *  | 
|---|
| 55 |  * <pre> -D | 
|---|
| 56 |  *  Don't normalize input data.</pre> | 
|---|
| 57 |  *  | 
|---|
| 58 |  * <pre> -R <num> | 
|---|
| 59 |  *  Retain enough PC attributes to account | 
|---|
| 60 |  *  for this proportion of variance in the original data. | 
|---|
| 61 |  *  (default: 0.95)</pre> | 
|---|
| 62 |  *  | 
|---|
| 63 |  * <pre> -A <num> | 
|---|
| 64 |  *  Maximum number of attributes to include in  | 
|---|
| 65 |  *  transformed attribute names. | 
|---|
| 66 |  *  (-1 = include all, default: 5)</pre> | 
|---|
| 67 |  *  | 
|---|
| 68 |  * <pre> -M <num> | 
|---|
| 69 |  *  Maximum number of PC attributes to retain. | 
|---|
| 70 |  *  (-1 = include all, default: -1)</pre> | 
|---|
| 71 |  *  | 
|---|
| 72 |  <!-- options-end --> | 
|---|
| 73 |  * | 
|---|
| 74 |  * @author Mark Hall (mhall@cs.waikato.ac.nz) -- attribute selection code | 
|---|
| 75 |  * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) -- attribute selection code | 
|---|
| 76 |  * @author fracpete (fracpete at waikato dot ac dot nz) -- filter code | 
|---|
| 77 |  * @version $Revision: 5987 $ | 
|---|
| 78 |  */ | 
|---|
| 79 | public class PrincipalComponents | 
|---|
| 80 |   extends Filter | 
|---|
| 81 |   implements OptionHandler, UnsupervisedFilter { | 
|---|
| 82 |  | 
|---|
| 83 |   /** for serialization. */ | 
|---|
| 84 |   private static final long serialVersionUID = 4626939780964387784L; | 
|---|
| 85 |  | 
|---|
| 86 |   /** The data to transform analyse/transform. */ | 
|---|
| 87 |   protected Instances m_TrainInstances; | 
|---|
| 88 |  | 
|---|
| 89 |   /** Keep a copy for the class attribute (if set). */ | 
|---|
| 90 |   protected Instances m_TrainCopy; | 
|---|
| 91 |  | 
|---|
| 92 |   /** The header for the transformed data format. */ | 
|---|
| 93 |   protected Instances m_TransformedFormat; | 
|---|
| 94 |  | 
|---|
| 95 |   /** Data has a class set. */ | 
|---|
| 96 |   protected boolean m_HasClass; | 
|---|
| 97 |  | 
|---|
| 98 |   /** Class index. */ | 
|---|
| 99 |   protected int m_ClassIndex; | 
|---|
| 100 |  | 
|---|
| 101 |   /** Number of attributes. */ | 
|---|
| 102 |   protected int m_NumAttribs; | 
|---|
| 103 |  | 
|---|
| 104 |   /** Number of instances. */ | 
|---|
| 105 |   protected int m_NumInstances; | 
|---|
| 106 |  | 
|---|
| 107 |   /** Correlation matrix for the original data. */ | 
|---|
| 108 |   protected double[][] m_Correlation; | 
|---|
| 109 |  | 
|---|
| 110 |   /** Will hold the unordered linear transformations of the (normalized) | 
|---|
| 111 |       original data. */ | 
|---|
| 112 |   protected double[][] m_Eigenvectors; | 
|---|
| 113 |  | 
|---|
| 114 |   /** Eigenvalues for the corresponding eigenvectors. */ | 
|---|
| 115 |   protected double[] m_Eigenvalues = null; | 
|---|
| 116 |  | 
|---|
| 117 |   /** Sorted eigenvalues. */ | 
|---|
| 118 |   protected int[] m_SortedEigens; | 
|---|
| 119 |  | 
|---|
| 120 |   /** sum of the eigenvalues. */ | 
|---|
| 121 |   protected double m_SumOfEigenValues = 0.0; | 
|---|
| 122 |  | 
|---|
| 123 |   /** Filters for replacing missing values. */ | 
|---|
| 124 |   protected ReplaceMissingValues m_ReplaceMissingFilter; | 
|---|
| 125 |    | 
|---|
| 126 |   /** Filter for normalizing the data. */ | 
|---|
| 127 |   protected Normalize m_NormalizeFilter; | 
|---|
| 128 |    | 
|---|
| 129 |   /** Filter for turning nominal values into numeric ones. */ | 
|---|
| 130 |   protected NominalToBinary m_NominalToBinaryFilter; | 
|---|
| 131 |    | 
|---|
| 132 |   /** Filter for removing class attribute, nominal attributes with 0 or 1 value. */ | 
|---|
| 133 |   protected Remove m_AttributeFilter; | 
|---|
| 134 |  | 
|---|
| 135 |   /** The number of attributes in the pc transformed data. */ | 
|---|
| 136 |   protected int m_OutputNumAtts = -1; | 
|---|
| 137 |  | 
|---|
| 138 |   /** normalize the input data? */ | 
|---|
| 139 |   protected boolean m_Normalize = true; | 
|---|
| 140 |  | 
|---|
| 141 |   /** the amount of varaince to cover in the original data when | 
|---|
| 142 |       retaining the best n PC's. */ | 
|---|
| 143 |   protected double m_CoverVariance = 0.95; | 
|---|
| 144 |  | 
|---|
| 145 |   /** maximum number of attributes in the transformed attribute name. */ | 
|---|
| 146 |   protected int m_MaxAttrsInName = 5; | 
|---|
| 147 |  | 
|---|
| 148 |   /** maximum number of attributes in the transformed data (-1 for all). */ | 
|---|
| 149 |   protected int m_MaxAttributes = -1; | 
|---|
| 150 |  | 
|---|
| 151 |   /** | 
|---|
| 152 |    * Returns a string describing this filter. | 
|---|
| 153 |    * | 
|---|
| 154 |    * @return            a description of the filter suitable for | 
|---|
| 155 |    *                    displaying in the explorer/experimenter gui | 
|---|
| 156 |    */ | 
|---|
| 157 |   public String globalInfo() { | 
|---|
| 158 |     return  | 
|---|
| 159 |         "Performs a principal components analysis and transformation of " | 
|---|
| 160 |       + "the data.\n" | 
|---|
| 161 |       + "Dimensionality reduction is accomplished by choosing enough eigenvectors " | 
|---|
| 162 |       + "to account for some percentage of the variance in the original data -- " | 
|---|
| 163 |       + "default 0.95 (95%).\n" | 
|---|
| 164 |       + "Based on code of the attribute selection scheme 'PrincipalComponents' " | 
|---|
| 165 |       + "by Mark Hall and Gabi Schmidberger."; | 
|---|
| 166 |   } | 
|---|
| 167 |  | 
|---|
| 168 |   /** | 
|---|
| 169 |    * Returns an enumeration describing the available options. | 
|---|
| 170 |    * | 
|---|
| 171 |    * @return            an enumeration of all the available options. | 
|---|
| 172 |    */ | 
|---|
| 173 |   public Enumeration listOptions() { | 
|---|
| 174 |     Vector result = new Vector(); | 
|---|
| 175 |  | 
|---|
| 176 |     result.addElement(new Option( | 
|---|
| 177 |         "\tDon't normalize input data.",  | 
|---|
| 178 |         "D", 0, "-D")); | 
|---|
| 179 |  | 
|---|
| 180 |     result.addElement(new Option( | 
|---|
| 181 |         "\tRetain enough PC attributes to account\n" | 
|---|
| 182 |         +"\tfor this proportion of variance in the original data.\n" | 
|---|
| 183 |         + "\t(default: 0.95)", | 
|---|
| 184 |         "R", 1, "-R <num>")); | 
|---|
| 185 |  | 
|---|
| 186 |     result.addElement(new Option( | 
|---|
| 187 |         "\tMaximum number of attributes to include in \n" | 
|---|
| 188 |         + "\ttransformed attribute names.\n" | 
|---|
| 189 |         + "\t(-1 = include all, default: 5)",  | 
|---|
| 190 |         "A", 1, "-A <num>")); | 
|---|
| 191 |  | 
|---|
| 192 |     result.addElement(new Option( | 
|---|
| 193 |         "\tMaximum number of PC attributes to retain.\n" | 
|---|
| 194 |         + "\t(-1 = include all, default: -1)",  | 
|---|
| 195 |         "M", 1, "-M <num>")); | 
|---|
| 196 |  | 
|---|
| 197 |     return result.elements(); | 
|---|
| 198 |   } | 
|---|
| 199 |  | 
|---|
| 200 |   /** | 
|---|
| 201 |    * Parses a list of options for this object. <p/> | 
|---|
| 202 |    * | 
|---|
| 203 |    <!-- options-start --> | 
|---|
| 204 |    * Valid options are: <p/> | 
|---|
| 205 |    *  | 
|---|
| 206 |    * <pre> -D | 
|---|
| 207 |    *  Don't normalize input data.</pre> | 
|---|
| 208 |    *  | 
|---|
| 209 |    * <pre> -R <num> | 
|---|
| 210 |    *  Retain enough PC attributes to account | 
|---|
| 211 |    *  for this proportion of variance in the original data. | 
|---|
| 212 |    *  (default: 0.95)</pre> | 
|---|
| 213 |    *  | 
|---|
| 214 |    * <pre> -A <num> | 
|---|
| 215 |    *  Maximum number of attributes to include in  | 
|---|
| 216 |    *  transformed attribute names. | 
|---|
| 217 |    *  (-1 = include all, default: 5)</pre> | 
|---|
| 218 |    *  | 
|---|
| 219 |    * <pre> -M <num> | 
|---|
| 220 |    *  Maximum number of PC attributes to retain. | 
|---|
| 221 |    *  (-1 = include all, default: -1)</pre> | 
|---|
| 222 |    *  | 
|---|
| 223 |    <!-- options-end --> | 
|---|
| 224 |    * | 
|---|
| 225 |    * @param options     the list of options as an array of strings | 
|---|
| 226 |    * @throws Exception  if an option is not supported | 
|---|
| 227 |    */ | 
|---|
| 228 |   public void setOptions(String[] options) throws Exception { | 
|---|
| 229 |     String        tmpStr; | 
|---|
| 230 |  | 
|---|
| 231 |     tmpStr = Utils.getOption('R', options); | 
|---|
| 232 |     if (tmpStr.length() != 0) | 
|---|
| 233 |       setVarianceCovered(Double.parseDouble(tmpStr)); | 
|---|
| 234 |     else | 
|---|
| 235 |       setVarianceCovered(0.95); | 
|---|
| 236 |  | 
|---|
| 237 |     tmpStr = Utils.getOption('A', options); | 
|---|
| 238 |     if (tmpStr.length() != 0) | 
|---|
| 239 |       setMaximumAttributeNames(Integer.parseInt(tmpStr)); | 
|---|
| 240 |     else | 
|---|
| 241 |       setMaximumAttributeNames(5); | 
|---|
| 242 |  | 
|---|
| 243 |     tmpStr = Utils.getOption('M', options); | 
|---|
| 244 |     if (tmpStr.length() != 0) | 
|---|
| 245 |       setMaximumAttributes(Integer.parseInt(tmpStr)); | 
|---|
| 246 |     else | 
|---|
| 247 |       setMaximumAttributes(-1); | 
|---|
| 248 |  | 
|---|
| 249 |     setNormalize(!Utils.getFlag('D', options)); | 
|---|
| 250 |   } | 
|---|
| 251 |  | 
|---|
| 252 |   /** | 
|---|
| 253 |    * Gets the current settings of the filter. | 
|---|
| 254 |    * | 
|---|
| 255 |    * @return            an array of strings suitable for passing to setOptions | 
|---|
| 256 |    */ | 
|---|
| 257 |   public String[] getOptions() { | 
|---|
| 258 |     Vector<String>      result; | 
|---|
| 259 |  | 
|---|
| 260 |     result = new Vector<String>(); | 
|---|
| 261 |  | 
|---|
| 262 |     result.add("-R"); | 
|---|
| 263 |     result.add("" + getVarianceCovered()); | 
|---|
| 264 |  | 
|---|
| 265 |     result.add("-A"); | 
|---|
| 266 |     result.add("" + getMaximumAttributeNames()); | 
|---|
| 267 |  | 
|---|
| 268 |     result.add("-M"); | 
|---|
| 269 |     result.add("" + getMaximumAttributes()); | 
|---|
| 270 |  | 
|---|
| 271 |     if (!getNormalize()) | 
|---|
| 272 |       result.add("-D"); | 
|---|
| 273 |  | 
|---|
| 274 |     return result.toArray(new String[result.size()]); | 
|---|
| 275 |   } | 
|---|
| 276 |  | 
|---|
| 277 |   /** | 
|---|
| 278 |    * Returns the tip text for this property. | 
|---|
| 279 |    *  | 
|---|
| 280 |    * @return            tip text for this property suitable for | 
|---|
| 281 |    *                    displaying in the explorer/experimenter gui | 
|---|
| 282 |    */ | 
|---|
| 283 |   public String normalizeTipText() { | 
|---|
| 284 |     return "Normalize input data."; | 
|---|
| 285 |   } | 
|---|
| 286 |  | 
|---|
| 287 |   /** | 
|---|
| 288 |    * Set whether input data will be normalized. | 
|---|
| 289 |    *  | 
|---|
| 290 |    * @param value       true if input data is to be normalized | 
|---|
| 291 |    */ | 
|---|
| 292 |   public void setNormalize(boolean value) { | 
|---|
| 293 |     m_Normalize = value; | 
|---|
| 294 |   } | 
|---|
| 295 |  | 
|---|
| 296 |   /** | 
|---|
| 297 |    * Gets whether or not input data is to be normalized. | 
|---|
| 298 |    *  | 
|---|
| 299 |    * @return            true if input data is to be normalized | 
|---|
| 300 |    */ | 
|---|
| 301 |   public boolean getNormalize() { | 
|---|
| 302 |     return m_Normalize; | 
|---|
| 303 |   } | 
|---|
| 304 |  | 
|---|
| 305 |   /** | 
|---|
| 306 |    * Returns the tip text for this property. | 
|---|
| 307 |    *  | 
|---|
| 308 |    * @return            tip text for this property suitable for | 
|---|
| 309 |    *                    displaying in the explorer/experimenter gui | 
|---|
| 310 |    */ | 
|---|
| 311 |   public String varianceCoveredTipText() { | 
|---|
| 312 |     return "Retain enough PC attributes to account for this proportion of variance."; | 
|---|
| 313 |   } | 
|---|
| 314 |  | 
|---|
| 315 |   /** | 
|---|
| 316 |    * Sets the amount of variance to account for when retaining | 
|---|
| 317 |    * principal components. | 
|---|
| 318 |    *  | 
|---|
| 319 |    * @param value       the proportion of total variance to account for | 
|---|
| 320 |    */ | 
|---|
| 321 |   public void setVarianceCovered(double value) { | 
|---|
| 322 |     m_CoverVariance = value; | 
|---|
| 323 |   } | 
|---|
| 324 |  | 
|---|
| 325 |   /** | 
|---|
| 326 |    * Gets the proportion of total variance to account for when | 
|---|
| 327 |    * retaining principal components. | 
|---|
| 328 |    *  | 
|---|
| 329 |    * @return            the proportion of variance to account for | 
|---|
| 330 |    */ | 
|---|
| 331 |   public double getVarianceCovered() { | 
|---|
| 332 |     return m_CoverVariance; | 
|---|
| 333 |   } | 
|---|
| 334 |  | 
|---|
| 335 |   /** | 
|---|
| 336 |    * Returns the tip text for this property. | 
|---|
| 337 |    *  | 
|---|
| 338 |    * @return            tip text for this property suitable for | 
|---|
| 339 |    *                    displaying in the explorer/experimenter gui | 
|---|
| 340 |    */ | 
|---|
| 341 |   public String maximumAttributeNamesTipText() { | 
|---|
| 342 |     return "The maximum number of attributes to include in transformed attribute names."; | 
|---|
| 343 |   } | 
|---|
| 344 |  | 
|---|
| 345 |   /** | 
|---|
| 346 |    * Sets maximum number of attributes to include in | 
|---|
| 347 |    * transformed attribute names. | 
|---|
| 348 |    *  | 
|---|
| 349 |    * @param value       the maximum number of attributes | 
|---|
| 350 |    */ | 
|---|
| 351 |   public void setMaximumAttributeNames(int value) { | 
|---|
| 352 |     m_MaxAttrsInName = value; | 
|---|
| 353 |   } | 
|---|
| 354 |  | 
|---|
| 355 |   /** | 
|---|
| 356 |    * Gets maximum number of attributes to include in | 
|---|
| 357 |    * transformed attribute names. | 
|---|
| 358 |    *  | 
|---|
| 359 |    * @return            the maximum number of attributes | 
|---|
| 360 |    */ | 
|---|
| 361 |   public int getMaximumAttributeNames() { | 
|---|
| 362 |     return m_MaxAttrsInName; | 
|---|
| 363 |   } | 
|---|
| 364 |  | 
|---|
| 365 |   /** | 
|---|
| 366 |    * Returns the tip text for this property. | 
|---|
| 367 |    *  | 
|---|
| 368 |    * @return            tip text for this property suitable for | 
|---|
| 369 |    *                    displaying in the explorer/experimenter gui | 
|---|
| 370 |    */ | 
|---|
| 371 |   public String maximumAttributesTipText() { | 
|---|
| 372 |     return "The maximum number of PC attributes to retain."; | 
|---|
| 373 |   } | 
|---|
| 374 |  | 
|---|
| 375 |   /** | 
|---|
| 376 |    * Sets maximum number of PC attributes to retain. | 
|---|
| 377 |    *  | 
|---|
| 378 |    * @param value       the maximum number of attributes | 
|---|
| 379 |    */ | 
|---|
| 380 |   public void setMaximumAttributes(int value) { | 
|---|
| 381 |     m_MaxAttributes = value; | 
|---|
| 382 |   } | 
|---|
| 383 |  | 
|---|
| 384 |   /** | 
|---|
| 385 |    * Gets maximum number of PC attributes to retain. | 
|---|
| 386 |    *  | 
|---|
| 387 |    * @return            the maximum number of attributes | 
|---|
| 388 |    */ | 
|---|
| 389 |   public int getMaximumAttributes() { | 
|---|
| 390 |     return m_MaxAttributes; | 
|---|
| 391 |   } | 
|---|
| 392 |  | 
|---|
| 393 |   /** | 
|---|
| 394 |    * Returns the capabilities of this evaluator. | 
|---|
| 395 |    * | 
|---|
| 396 |    * @return            the capabilities of this evaluator | 
|---|
| 397 |    * @see               Capabilities | 
|---|
| 398 |    */ | 
|---|
| 399 |   public Capabilities getCapabilities() { | 
|---|
| 400 |     Capabilities result = super.getCapabilities(); | 
|---|
| 401 |     result.disableAll(); | 
|---|
| 402 |  | 
|---|
| 403 |     // attributes | 
|---|
| 404 |     result.enable(Capability.NOMINAL_ATTRIBUTES); | 
|---|
| 405 |     result.enable(Capability.NUMERIC_ATTRIBUTES); | 
|---|
| 406 |     result.enable(Capability.DATE_ATTRIBUTES); | 
|---|
| 407 |     result.enable(Capability.MISSING_VALUES); | 
|---|
| 408 |  | 
|---|
| 409 |     // class | 
|---|
| 410 |     result.enable(Capability.NOMINAL_CLASS); | 
|---|
| 411 |     result.enable(Capability.NUMERIC_CLASS); | 
|---|
| 412 |     result.enable(Capability.DATE_CLASS); | 
|---|
| 413 |     result.enable(Capability.MISSING_CLASS_VALUES); | 
|---|
| 414 |     result.enable(Capability.NO_CLASS); | 
|---|
| 415 |  | 
|---|
| 416 |     return result; | 
|---|
| 417 |   } | 
|---|
| 418 |  | 
|---|
| 419 |   /** | 
|---|
| 420 |    * Determines the output format based on the input format and returns  | 
|---|
| 421 |    * this. In case the output format cannot be returned immediately, i.e., | 
|---|
| 422 |    * immediateOutputFormat() returns false, then this method will be called | 
|---|
| 423 |    * from batchFinished(). | 
|---|
| 424 |    * | 
|---|
| 425 |    * @param inputFormat     the input format to base the output format on | 
|---|
| 426 |    * @return                the output format | 
|---|
| 427 |    * @throws Exception      in case the determination goes wrong | 
|---|
| 428 |    * @see   #hasImmediateOutputFormat() | 
|---|
| 429 |    * @see   #batchFinished() | 
|---|
| 430 |    */ | 
|---|
| 431 |   protected Instances determineOutputFormat(Instances inputFormat) throws Exception { | 
|---|
| 432 |     double              cumulative; | 
|---|
| 433 |     FastVector          attributes; | 
|---|
| 434 |     int                 i; | 
|---|
| 435 |     int                 j; | 
|---|
| 436 |     StringBuffer        attName; | 
|---|
| 437 |     double[]            coeff_mags; | 
|---|
| 438 |     int                 num_attrs; | 
|---|
| 439 |     int[]               coeff_inds; | 
|---|
| 440 |     double              coeff_value; | 
|---|
| 441 |     int                 numAttsLowerBound; | 
|---|
| 442 |      | 
|---|
| 443 |     if (m_Eigenvalues == null) | 
|---|
| 444 |       return inputFormat; | 
|---|
| 445 |  | 
|---|
| 446 |     if (m_MaxAttributes > 0) | 
|---|
| 447 |       numAttsLowerBound = m_NumAttribs - m_MaxAttributes; | 
|---|
| 448 |     else | 
|---|
| 449 |       numAttsLowerBound = 0; | 
|---|
| 450 |     if (numAttsLowerBound < 0) | 
|---|
| 451 |       numAttsLowerBound = 0; | 
|---|
| 452 |      | 
|---|
| 453 |     cumulative = 0.0; | 
|---|
| 454 |     attributes = new FastVector(); | 
|---|
| 455 |     for (i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) { | 
|---|
| 456 |       attName = new StringBuffer(); | 
|---|
| 457 |       // build array of coefficients | 
|---|
| 458 |       coeff_mags = new double[m_NumAttribs]; | 
|---|
| 459 |       for (j = 0; j < m_NumAttribs; j++) | 
|---|
| 460 |         coeff_mags[j] = -Math.abs(m_Eigenvectors[j][m_SortedEigens[i]]); | 
|---|
| 461 |       num_attrs = (m_MaxAttrsInName > 0) ? Math.min(m_NumAttribs, m_MaxAttrsInName) : m_NumAttribs; | 
|---|
| 462 |  | 
|---|
| 463 |       // this array contains the sorted indices of the coefficients | 
|---|
| 464 |       if (m_NumAttribs > 0) { | 
|---|
| 465 |         // if m_maxAttrsInName > 0, sort coefficients by decreasing magnitude | 
|---|
| 466 |         coeff_inds = Utils.sort(coeff_mags); | 
|---|
| 467 |       } | 
|---|
| 468 |       else { | 
|---|
| 469 |         // if  m_maxAttrsInName <= 0, use all coeffs in original order | 
|---|
| 470 |         coeff_inds = new int[m_NumAttribs]; | 
|---|
| 471 |         for (j = 0; j < m_NumAttribs; j++) | 
|---|
| 472 |           coeff_inds[j] = j; | 
|---|
| 473 |       } | 
|---|
| 474 |       // build final attName string | 
|---|
| 475 |       for (j = 0; j < num_attrs; j++) { | 
|---|
| 476 |         coeff_value = m_Eigenvectors[coeff_inds[j]][m_SortedEigens[i]]; | 
|---|
| 477 |         if (j > 0 && coeff_value >= 0) | 
|---|
| 478 |           attName.append("+"); | 
|---|
| 479 |         attName.append( | 
|---|
| 480 |             Utils.doubleToString(coeff_value,5,3)  | 
|---|
| 481 |             + inputFormat.attribute(coeff_inds[j]).name()); | 
|---|
| 482 |       } | 
|---|
| 483 |       if (num_attrs < m_NumAttribs) | 
|---|
| 484 |         attName.append("..."); | 
|---|
| 485 |  | 
|---|
| 486 |       attributes.addElement(new Attribute(attName.toString())); | 
|---|
| 487 |       cumulative += m_Eigenvalues[m_SortedEigens[i]]; | 
|---|
| 488 |  | 
|---|
| 489 |       if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) | 
|---|
| 490 |         break; | 
|---|
| 491 |     } | 
|---|
| 492 |  | 
|---|
| 493 |     if (m_HasClass) | 
|---|
| 494 |       attributes.addElement(m_TrainCopy.classAttribute().copy()); | 
|---|
| 495 |  | 
|---|
| 496 |     Instances outputFormat =  | 
|---|
| 497 |       new Instances( | 
|---|
| 498 |           m_TrainCopy.relationName() + "_principal components", attributes, 0); | 
|---|
| 499 |  | 
|---|
| 500 |     // set the class to be the last attribute if necessary | 
|---|
| 501 |     if (m_HasClass) | 
|---|
| 502 |       outputFormat.setClassIndex(outputFormat.numAttributes() - 1); | 
|---|
| 503 |  | 
|---|
| 504 |     m_OutputNumAtts = outputFormat.numAttributes(); | 
|---|
| 505 |      | 
|---|
| 506 |     return outputFormat; | 
|---|
| 507 |   } | 
|---|
| 508 |  | 
|---|
| 509 |   /** | 
|---|
| 510 |    * Fill the correlation matrix. | 
|---|
| 511 |    */ | 
|---|
| 512 |   protected void fillCorrelation() { | 
|---|
| 513 |     int         i; | 
|---|
| 514 |     int         j; | 
|---|
| 515 |     int         k; | 
|---|
| 516 |     double[]    att1; | 
|---|
| 517 |     double[]    att2; | 
|---|
| 518 |     double      corr; | 
|---|
| 519 |      | 
|---|
| 520 |     m_Correlation = new double[m_NumAttribs][m_NumAttribs]; | 
|---|
| 521 |     att1          = new double [m_NumInstances]; | 
|---|
| 522 |     att2          = new double [m_NumInstances]; | 
|---|
| 523 |  | 
|---|
| 524 |     for (i = 0; i < m_NumAttribs; i++) { | 
|---|
| 525 |       for (j = 0; j < m_NumAttribs; j++) { | 
|---|
| 526 |         if (i == j) { | 
|---|
| 527 |           m_Correlation[i][j] = 1.0; | 
|---|
| 528 |         } | 
|---|
| 529 |         else { | 
|---|
| 530 |           for (k = 0; k < m_NumInstances; k++) { | 
|---|
| 531 |             att1[k] = m_TrainInstances.instance(k).value(i); | 
|---|
| 532 |             att2[k] = m_TrainInstances.instance(k).value(j); | 
|---|
| 533 |           } | 
|---|
| 534 |           corr = Utils.correlation(att1,att2,m_NumInstances); | 
|---|
| 535 |           m_Correlation[i][j] = corr; | 
|---|
| 536 |           m_Correlation[j][i] = corr; | 
|---|
| 537 |         } | 
|---|
| 538 |       } | 
|---|
| 539 |     } | 
|---|
| 540 |   } | 
|---|
| 541 |  | 
|---|
| 542 |   /** | 
|---|
| 543 |    * Transform an instance in original (unormalized) format. | 
|---|
| 544 |    *  | 
|---|
| 545 |    * @param instance    an instance in the original (unormalized) format | 
|---|
| 546 |    * @return            a transformed instance | 
|---|
| 547 |    * @throws Exception  if instance can't be transformed | 
|---|
| 548 |    */ | 
|---|
| 549 |   protected Instance convertInstance(Instance instance) throws Exception { | 
|---|
| 550 |     Instance    result; | 
|---|
| 551 |     double[]    newVals; | 
|---|
| 552 |     Instance    tempInst; | 
|---|
| 553 |     double      cumulative; | 
|---|
| 554 |     int         i; | 
|---|
| 555 |     int         j; | 
|---|
| 556 |     double      tempval; | 
|---|
| 557 |     int         numAttsLowerBound; | 
|---|
| 558 |      | 
|---|
| 559 |     newVals  = new double[m_OutputNumAtts]; | 
|---|
| 560 |     tempInst = (Instance) instance.copy(); | 
|---|
| 561 |  | 
|---|
| 562 |     m_ReplaceMissingFilter.input(tempInst); | 
|---|
| 563 |     m_ReplaceMissingFilter.batchFinished(); | 
|---|
| 564 |     tempInst = m_ReplaceMissingFilter.output(); | 
|---|
| 565 |  | 
|---|
| 566 |     if (m_Normalize) { | 
|---|
| 567 |       m_NormalizeFilter.input(tempInst); | 
|---|
| 568 |       m_NormalizeFilter.batchFinished(); | 
|---|
| 569 |       tempInst = m_NormalizeFilter.output(); | 
|---|
| 570 |     } | 
|---|
| 571 |  | 
|---|
| 572 |     m_NominalToBinaryFilter.input(tempInst); | 
|---|
| 573 |     m_NominalToBinaryFilter.batchFinished(); | 
|---|
| 574 |     tempInst = m_NominalToBinaryFilter.output(); | 
|---|
| 575 |  | 
|---|
| 576 |     if (m_AttributeFilter != null) { | 
|---|
| 577 |       m_AttributeFilter.input(tempInst); | 
|---|
| 578 |       m_AttributeFilter.batchFinished(); | 
|---|
| 579 |       tempInst = m_AttributeFilter.output(); | 
|---|
| 580 |     } | 
|---|
| 581 |  | 
|---|
| 582 |     if (m_HasClass) | 
|---|
| 583 |       newVals[m_OutputNumAtts - 1] = instance.value(instance.classIndex()); | 
|---|
| 584 |  | 
|---|
| 585 |     if (m_MaxAttributes > 0) | 
|---|
| 586 |       numAttsLowerBound = m_NumAttribs - m_MaxAttributes; | 
|---|
| 587 |     else | 
|---|
| 588 |       numAttsLowerBound = 0; | 
|---|
| 589 |     if (numAttsLowerBound < 0) | 
|---|
| 590 |       numAttsLowerBound = 0; | 
|---|
| 591 |      | 
|---|
| 592 |     cumulative = 0; | 
|---|
| 593 |     for (i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) { | 
|---|
| 594 |       tempval = 0.0; | 
|---|
| 595 |       for (j = 0; j < m_NumAttribs; j++) | 
|---|
| 596 |         tempval += m_Eigenvectors[j][m_SortedEigens[i]] * tempInst.value(j); | 
|---|
| 597 |  | 
|---|
| 598 |       newVals[m_NumAttribs - i - 1] = tempval; | 
|---|
| 599 |       cumulative += m_Eigenvalues[m_SortedEigens[i]]; | 
|---|
| 600 |       if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) | 
|---|
| 601 |         break; | 
|---|
| 602 |     } | 
|---|
| 603 |  | 
|---|
| 604 |     // create instance | 
|---|
| 605 |     if (instance instanceof SparseInstance) | 
|---|
| 606 |       result = new SparseInstance(instance.weight(), newVals); | 
|---|
| 607 |     else | 
|---|
| 608 |       result = new DenseInstance(instance.weight(), newVals); | 
|---|
| 609 |      | 
|---|
| 610 |     return result; | 
|---|
| 611 |   } | 
|---|
| 612 |  | 
|---|
| 613 |   /** | 
|---|
| 614 |    * Initializes the filter with the given input data. | 
|---|
| 615 |    * | 
|---|
| 616 |    * @param instances   the data to process | 
|---|
| 617 |    * @throws Exception  in case the processing goes wrong | 
|---|
| 618 |    * @see               #batchFinished() | 
|---|
| 619 |    */ | 
|---|
| 620 |   protected void setup(Instances instances) throws Exception { | 
|---|
| 621 |     int                         i; | 
|---|
| 622 |     int                         j; | 
|---|
| 623 |     Vector<Integer>             deleteCols; | 
|---|
| 624 |     int[]                       todelete; | 
|---|
| 625 |     double[][]                  v; | 
|---|
| 626 |     Matrix                      corr; | 
|---|
| 627 |     EigenvalueDecomposition     eig; | 
|---|
| 628 |     Matrix                      V; | 
|---|
| 629 |      | 
|---|
| 630 |     m_TrainInstances = new Instances(instances); | 
|---|
| 631 |  | 
|---|
| 632 |     // make a copy of the training data so that we can get the class | 
|---|
| 633 |     // column to append to the transformed data (if necessary) | 
|---|
| 634 |     m_TrainCopy = new Instances(m_TrainInstances, 0); | 
|---|
| 635 |  | 
|---|
| 636 |     m_ReplaceMissingFilter = new ReplaceMissingValues(); | 
|---|
| 637 |     m_ReplaceMissingFilter.setInputFormat(m_TrainInstances); | 
|---|
| 638 |     m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter); | 
|---|
| 639 |  | 
|---|
| 640 |     if (m_Normalize) { | 
|---|
| 641 |       m_NormalizeFilter = new Normalize(); | 
|---|
| 642 |       m_NormalizeFilter.setInputFormat(m_TrainInstances); | 
|---|
| 643 |       m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NormalizeFilter); | 
|---|
| 644 |     } | 
|---|
| 645 |  | 
|---|
| 646 |     m_NominalToBinaryFilter = new NominalToBinary(); | 
|---|
| 647 |     m_NominalToBinaryFilter.setInputFormat(m_TrainInstances); | 
|---|
| 648 |     m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter); | 
|---|
| 649 |  | 
|---|
| 650 |     // delete any attributes with only one distinct value or are all missing | 
|---|
| 651 |     deleteCols = new Vector<Integer>(); | 
|---|
| 652 |     for (i = 0; i < m_TrainInstances.numAttributes(); i++) { | 
|---|
| 653 |       if (m_TrainInstances.numDistinctValues(i) <= 1) | 
|---|
| 654 |         deleteCols.addElement(i); | 
|---|
| 655 |     } | 
|---|
| 656 |  | 
|---|
| 657 |     if (m_TrainInstances.classIndex() >=0) { | 
|---|
| 658 |       // get rid of the class column | 
|---|
| 659 |       m_HasClass = true; | 
|---|
| 660 |       m_ClassIndex = m_TrainInstances.classIndex(); | 
|---|
| 661 |       deleteCols.addElement(new Integer(m_ClassIndex)); | 
|---|
| 662 |     } | 
|---|
| 663 |  | 
|---|
| 664 |     // remove columns from the data if necessary | 
|---|
| 665 |     if (deleteCols.size() > 0) { | 
|---|
| 666 |       m_AttributeFilter = new Remove(); | 
|---|
| 667 |       todelete = new int [deleteCols.size()]; | 
|---|
| 668 |       for (i = 0; i < deleteCols.size(); i++) | 
|---|
| 669 |         todelete[i] = ((Integer)(deleteCols.elementAt(i))).intValue(); | 
|---|
| 670 |       m_AttributeFilter.setAttributeIndicesArray(todelete); | 
|---|
| 671 |       m_AttributeFilter.setInvertSelection(false); | 
|---|
| 672 |       m_AttributeFilter.setInputFormat(m_TrainInstances); | 
|---|
| 673 |       m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter); | 
|---|
| 674 |     } | 
|---|
| 675 |  | 
|---|
| 676 |     // can evaluator handle the processed data ? e.g., enough attributes? | 
|---|
| 677 |     getCapabilities().testWithFail(m_TrainInstances); | 
|---|
| 678 |  | 
|---|
| 679 |     m_NumInstances = m_TrainInstances.numInstances(); | 
|---|
| 680 |     m_NumAttribs   = m_TrainInstances.numAttributes(); | 
|---|
| 681 |  | 
|---|
| 682 |     fillCorrelation(); | 
|---|
| 683 |  | 
|---|
| 684 |     // get eigen vectors/values | 
|---|
| 685 |     corr = new Matrix(m_Correlation); | 
|---|
| 686 |     eig  = corr.eig(); | 
|---|
| 687 |     V    = eig.getV(); | 
|---|
| 688 |     v    = new double[m_NumAttribs][m_NumAttribs]; | 
|---|
| 689 |     for (i = 0; i < v.length; i++) { | 
|---|
| 690 |       for (j = 0; j < v[0].length; j++) | 
|---|
| 691 |         v[i][j] = V.get(i, j); | 
|---|
| 692 |     } | 
|---|
| 693 |     m_Eigenvectors = (double[][]) v.clone(); | 
|---|
| 694 |     m_Eigenvalues  = (double[]) eig.getRealEigenvalues().clone(); | 
|---|
| 695 |  | 
|---|
| 696 |     // any eigenvalues less than 0 are not worth anything --- change to 0 | 
|---|
| 697 |     for (i = 0; i < m_Eigenvalues.length; i++) { | 
|---|
| 698 |       if (m_Eigenvalues[i] < 0) | 
|---|
| 699 |         m_Eigenvalues[i] = 0.0; | 
|---|
| 700 |     } | 
|---|
| 701 |     m_SortedEigens     = Utils.sort(m_Eigenvalues); | 
|---|
| 702 |     m_SumOfEigenValues = Utils.sum(m_Eigenvalues); | 
|---|
| 703 |  | 
|---|
| 704 |     m_TransformedFormat = determineOutputFormat(m_TrainInstances); | 
|---|
| 705 |     setOutputFormat(m_TransformedFormat); | 
|---|
| 706 |      | 
|---|
| 707 |     m_TrainInstances = null; | 
|---|
| 708 |   } | 
|---|
| 709 |  | 
|---|
| 710 |   /** | 
|---|
| 711 |    * Sets the format of the input instances. | 
|---|
| 712 |    * | 
|---|
| 713 |    * @param instanceInfo        an Instances object containing the input  | 
|---|
| 714 |    *                            instance structure (any instances contained  | 
|---|
| 715 |    *                            in the object are ignored - only the structure  | 
|---|
| 716 |    *                            is required). | 
|---|
| 717 |    * @return                    true if the outputFormat may be collected  | 
|---|
| 718 |    *                            immediately | 
|---|
| 719 |    * @throws Exception          if the input format can't be set successfully | 
|---|
| 720 |    */ | 
|---|
| 721 |   public boolean setInputFormat(Instances instanceInfo) throws Exception { | 
|---|
| 722 |     super.setInputFormat(instanceInfo); | 
|---|
| 723 |  | 
|---|
| 724 |     m_Eigenvalues           = null; | 
|---|
| 725 |     m_OutputNumAtts         = -1; | 
|---|
| 726 |     m_AttributeFilter       = null; | 
|---|
| 727 |     m_NominalToBinaryFilter = null; | 
|---|
| 728 |     m_SumOfEigenValues      = 0.0; | 
|---|
| 729 |      | 
|---|
| 730 |     return false; | 
|---|
| 731 |   } | 
|---|
| 732 |  | 
|---|
| 733 |   /** | 
|---|
| 734 |    * Input an instance for filtering. Filter requires all | 
|---|
| 735 |    * training instances be read before producing output. | 
|---|
| 736 |    * | 
|---|
| 737 |    * @param instance                    the input instance | 
|---|
| 738 |    * @return                            true if the filtered instance may now be | 
|---|
| 739 |    *                                    collected with output(). | 
|---|
| 740 |    * @throws IllegalStateException      if no input format has been set | 
|---|
| 741 |    * @throws Exception                  if conversion fails | 
|---|
| 742 |    */ | 
|---|
| 743 |   public boolean input(Instance instance) throws Exception { | 
|---|
| 744 |     Instance    inst; | 
|---|
| 745 |      | 
|---|
| 746 |     if (getInputFormat() == null) | 
|---|
| 747 |       throw new IllegalStateException("No input instance format defined"); | 
|---|
| 748 |  | 
|---|
| 749 |     if (isNewBatch()) { | 
|---|
| 750 |       resetQueue(); | 
|---|
| 751 |       m_NewBatch = false; | 
|---|
| 752 |     } | 
|---|
| 753 |      | 
|---|
| 754 |     if (isFirstBatchDone()) { | 
|---|
| 755 |       inst = convertInstance(instance); | 
|---|
| 756 |       inst.setDataset(getOutputFormat()); | 
|---|
| 757 |       push(inst); | 
|---|
| 758 |       return true; | 
|---|
| 759 |     } | 
|---|
| 760 |     else { | 
|---|
| 761 |       bufferInput(instance); | 
|---|
| 762 |       return false; | 
|---|
| 763 |     } | 
|---|
| 764 |   } | 
|---|
| 765 |  | 
|---|
| 766 |   /** | 
|---|
| 767 |    * Signify that this batch of input to the filter is finished. | 
|---|
| 768 |    * | 
|---|
| 769 |    * @return true                       if there are instances pending output | 
|---|
| 770 |    * @throws NullPointerException       if no input structure has been defined, | 
|---|
| 771 |    * @throws Exception                  if there was a problem finishing the batch. | 
|---|
| 772 |    */ | 
|---|
| 773 |   public boolean batchFinished() throws Exception { | 
|---|
| 774 |     int         i; | 
|---|
| 775 |     Instances   insts; | 
|---|
| 776 |     Instance    inst; | 
|---|
| 777 |      | 
|---|
| 778 |     if (getInputFormat() == null) | 
|---|
| 779 |       throw new NullPointerException("No input instance format defined"); | 
|---|
| 780 |  | 
|---|
| 781 |     insts = getInputFormat(); | 
|---|
| 782 |  | 
|---|
| 783 |     if (!isFirstBatchDone()) | 
|---|
| 784 |       setup(insts); | 
|---|
| 785 |      | 
|---|
| 786 |     for (i = 0; i < insts.numInstances(); i++) { | 
|---|
| 787 |       inst = convertInstance(insts.instance(i)); | 
|---|
| 788 |       inst.setDataset(getOutputFormat()); | 
|---|
| 789 |       push(inst); | 
|---|
| 790 |     } | 
|---|
| 791 |      | 
|---|
| 792 |     flushInput(); | 
|---|
| 793 |     m_NewBatch       = true; | 
|---|
| 794 |     m_FirstBatchDone = true; | 
|---|
| 795 |      | 
|---|
| 796 |     return (numPendingOutput() != 0); | 
|---|
| 797 |   } | 
|---|
| 798 |    | 
|---|
| 799 |   /** | 
|---|
| 800 |    * Returns the revision string. | 
|---|
| 801 |    *  | 
|---|
| 802 |    * @return            the revision | 
|---|
| 803 |    */ | 
|---|
| 804 |   public String getRevision() { | 
|---|
| 805 |     return RevisionUtils.extract("$Revision: 5987 $"); | 
|---|
| 806 |   } | 
|---|
| 807 |  | 
|---|
| 808 |   /** | 
|---|
| 809 |    * Main method for running this filter. | 
|---|
| 810 |    * | 
|---|
| 811 |    * @param args        should contain arguments to the filter: use -h for help | 
|---|
| 812 |    */ | 
|---|
| 813 |   public static void main(String[] args) { | 
|---|
| 814 |     runFilter(new PrincipalComponents(), args); | 
|---|
| 815 |   } | 
|---|
| 816 | } | 
|---|