source: src/main/java/weka/attributeSelection/PrincipalComponents.java @ 19

Last change on this file since 19 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 30.3 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    PrincipalComponents.java
19 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
20 *
21 */
22
23package weka.attributeSelection;
24
25import java.util.Enumeration;
26import java.util.Vector;
27
28import weka.core.Attribute;
29import weka.core.Capabilities;
30import weka.core.FastVector;
31import weka.core.Instance;
32import weka.core.DenseInstance;
33import weka.core.Instances;
34import weka.core.Matrix;
35import weka.core.Option;
36import weka.core.OptionHandler;
37import weka.core.RevisionUtils;
38import weka.core.SparseInstance;
39import weka.core.Utils;
40import weka.core.Capabilities.Capability;
41import weka.filters.Filter;
42import weka.filters.unsupervised.attribute.NominalToBinary;
43import weka.filters.unsupervised.attribute.Normalize;
44import weka.filters.unsupervised.attribute.Remove;
45import weka.filters.unsupervised.attribute.ReplaceMissingValues;
46
47/**
48 <!-- globalinfo-start -->
49 * Performs a principal components analysis and transformation of the data. Use in conjunction with a Ranker search. Dimensionality reduction is accomplished by choosing enough eigenvectors to account for some percentage of the variance in the original data---default 0.95 (95%). Attribute noise can be filtered by transforming to the PC space, eliminating some of the worst eigenvectors, and then transforming back to the original space.
50 * <p/>
51 <!-- globalinfo-end -->
52 *
53 <!-- options-start -->
54 * Valid options are: <p/>
55 *
56 * <pre> -D
57 *  Don't normalize input data.</pre>
58 *
59 * <pre> -R
60 *  Retain enough PC attributes to account
61 *  for this proportion of variance in the original data.
62 *  (default = 0.95)</pre>
63 *
64 * <pre> -O
65 *  Transform through the PC space and
66 *  back to the original space.</pre>
67 *
68 * <pre> -A
69 *  Maximum number of attributes to include in
70 *  transformed attribute names. (-1 = include all)</pre>
71 *
72 <!-- options-end -->
73 *
74 * @author Mark Hall (mhall@cs.waikato.ac.nz)
75 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
76 * @version $Revision: 5987 $
77 */
78public class PrincipalComponents 
79  extends UnsupervisedAttributeEvaluator
80  implements AttributeTransformer, OptionHandler {
81 
82  /** for serialization */
83  static final long serialVersionUID = 3310137541055815078L;
84 
85  /** The data to analyse/transform */
86  private Instances m_trainInstances;
87
88  /** Keep a copy for the class attribute (if set) */
89  private Instances m_trainHeader;
90
91  /** The header for the transformed data format */
92  private Instances m_transformedFormat;
93
94  /** The header for data transformed back to the original space */
95  private Instances m_originalSpaceFormat;
96
97  /** Data has a class set */
98  private boolean m_hasClass;
99
100  /** Class index */
101  private int m_classIndex;
102
103  /** Number of attributes */
104  private int m_numAttribs;
105
106  /** Number of instances */
107  private int m_numInstances;
108
109  /** Correlation matrix for the original data */
110  private double [][] m_correlation;
111
112  /** Will hold the unordered linear transformations of the (normalized)
113      original data */
114  private double [][] m_eigenvectors;
115 
116  /** Eigenvalues for the corresponding eigenvectors */
117  private double [] m_eigenvalues = null;
118
119  /** Indices into m_eigenvalues, ordered by eigenvalue (as returned by Utils.sort) */
120  private int [] m_sortedEigens;
121
122  /** sum of the eigenvalues */
123  private double m_sumOfEigenValues = 0.0;
124 
125  /** Filters for original data */
126  private ReplaceMissingValues m_replaceMissingFilter;
127  private Normalize m_normalizeFilter;
128  private NominalToBinary m_nominalToBinFilter;
129  private Remove m_attributeFilter;
130 
131  /** used to remove the class column if a class column is set */
132  private Remove m_attribFilter;
133
134  /** The number of attributes in the pc transformed data */
135  private int m_outputNumAtts = -1;
136 
137  /** normalize the input data? */
138  private boolean m_normalize = true;
139
140  /** the amount of variance to cover in the original data when
141      retaining the best n PC's */
142  private double m_coverVariance = 0.95;
143
144  /** transform the data through the pc space and back to the original
145      space ? */
146  private boolean m_transBackToOriginal = false;
147 
148  /** maximum number of attributes in the transformed attribute name */
149  private int m_maxAttrsInName = 5;
150
151  /** holds the transposed eigenvectors for converting back to the
152      original space */
153  private double [][] m_eTranspose;
154
155  /**
156   * Returns a string describing this attribute transformer
157   * @return a description of the evaluator suitable for
158   * displaying in the explorer/experimenter gui
159   */
160  public String globalInfo() {
161    return "Performs a principal components analysis and transformation of "
162      +"the data. Use in conjunction with a Ranker search. Dimensionality "
163      +"reduction is accomplished by choosing enough eigenvectors to "
164      +"account for some percentage of the variance in the original data---"
165      +"default 0.95 (95%). Attribute noise can be filtered by transforming "
166      +"to the PC space, eliminating some of the worst eigenvectors, and "
167      +"then transforming back to the original space.";
168  }
169
170  /**
171   * Returns an enumeration describing the available options. <p>
172   *
173   * @return an enumeration of all the available options.
174   **/
175  public Enumeration listOptions () {
176    Vector newVector = new Vector(3);
177    newVector.addElement(new Option("\tDon't normalize input data." 
178                                    , "D", 0, "-D"));
179
180    newVector.addElement(new Option("\tRetain enough PC attributes to account "
181                                    +"\n\tfor this proportion of variance in "
182                                    +"the original data.\n"
183                                    + "\t(default = 0.95)",
184                                    "R",1,"-R"));
185   
186    newVector.addElement(new Option("\tTransform through the PC space and "
187                                    +"\n\tback to the original space."
188                                    , "O", 0, "-O"));
189                                   
190    newVector.addElement(new Option("\tMaximum number of attributes to include in "
191                                    + "\n\ttransformed attribute names. (-1 = include all)"
192                                    , "A", 1, "-A"));
193    return  newVector.elements();
194  }
195
196  /**
197   * Parses a given list of options. <p/>
198   *
199   <!-- options-start -->
200   * Valid options are: <p/>
201   *
202   * <pre> -D
203   *  Don't normalize input data.</pre>
204   *
205   * <pre> -R
206   *  Retain enough PC attributes to account
207   *  for this proportion of variance in the original data.
208   *  (default = 0.95)</pre>
209   *
210   * <pre> -O
211   *  Transform through the PC space and
212   *  back to the original space.</pre>
213   *
214   * <pre> -A
215   *  Maximum number of attributes to include in
216   *  transformed attribute names. (-1 = include all)</pre>
217   *
218   <!-- options-end -->
219   *
220   * @param options the list of options as an array of strings
221   * @throws Exception if an option is not supported
222   */
223  public void setOptions (String[] options)
224    throws Exception {
225    resetOptions();
226    String optionString;
227
228    optionString = Utils.getOption('R', options);
229    if (optionString.length() != 0) {
230      Double temp;
231      temp = Double.valueOf(optionString);
232      setVarianceCovered(temp.doubleValue());
233    }
234    optionString = Utils.getOption('A', options);
235    if (optionString.length() != 0) {
236      setMaximumAttributeNames(Integer.parseInt(optionString));
237    }
238    setNormalize(!Utils.getFlag('D', options));
239
240    setTransformBackToOriginal(Utils.getFlag('O', options));
241  }
242
243  /**
244   * Reset to defaults
245   */
246  private void resetOptions() {
247    m_coverVariance = 0.95;
248    m_normalize = true;
249    m_sumOfEigenValues = 0.0;
250    m_transBackToOriginal = false;
251  }
252
253  /**
254   * Returns the tip text for this property
255   * @return tip text for this property suitable for
256   * displaying in the explorer/experimenter gui
257   */
258  public String normalizeTipText() {
259    return "Normalize input data.";
260  }
261
262  /**
263   * Set whether input data will be normalized.
264   * @param n true if input data is to be normalized
265   */
266  public void setNormalize(boolean n) {
267    m_normalize = n;
268  }
269
270  /**
271   * Gets whether or not input data is to be normalized
272   * @return true if input data is to be normalized
273   */
274  public boolean getNormalize() {
275    return m_normalize;
276  }
277
278  /**
279   * Returns the tip text for this property
280   * @return tip text for this property suitable for
281   * displaying in the explorer/experimenter gui
282   */
283  public String varianceCoveredTipText() {
284    return "Retain enough PC attributes to account for this proportion of "
285      +"variance.";
286  }
287
288  /**
289   * Sets the amount of variance to account for when retaining
290   * principal components
291   * @param vc the proportion of total variance to account for
292   */
293  public void setVarianceCovered(double vc) {
294    m_coverVariance = vc;
295  }
296
297  /**
298   * Gets the proportion of total variance to account for when
299   * retaining principal components
300   * @return the proportion of variance to account for
301   */
302  public double getVarianceCovered() {
303    return m_coverVariance;
304  }
305
306  /**
307   * Returns the tip text for this property
308   * @return tip text for this property suitable for
309   * displaying in the explorer/experimenter gui
310   */
311  public String maximumAttributeNamesTipText() {
312    return "The maximum number of attributes to include in transformed attribute names.";
313  }
314
315  /**
316   * Sets maximum number of attributes to include in
317   * transformed attribute names.
318   * @param m the maximum number of attributes
319   */
320  public void setMaximumAttributeNames(int m) {
321    m_maxAttrsInName = m;
322  }
323
324  /**
325   * Gets maximum number of attributes to include in
326   * transformed attribute names.
327   * @return the maximum number of attributes
328   */
329  public int getMaximumAttributeNames() {
330    return m_maxAttrsInName;
331  }
332
333  /**
334   * Returns the tip text for this property
335   * @return tip text for this property suitable for
336   * displaying in the explorer/experimenter gui
337   */
338  public String transformBackToOriginalTipText() {
339    return "Transform through the PC space and back to the original space. "
340      +"If only the best n PCs are retained (by setting varianceCovered < 1) "
341      +"then this option will give a dataset in the original space but with "
342      +"less attribute noise.";
343  }
344
345  /**
346   * Sets whether the data should be transformed back to the original
347   * space
348   * @param b true if the data should be transformed back to the
349   * original space
350   */
351  public void setTransformBackToOriginal(boolean b) {
352    m_transBackToOriginal = b;
353  }
354 
355  /**
356   * Gets whether the data is to be transformed back to the original
357   * space.
358   * @return true if the data is to be transformed back to the original space
359   */
360  public boolean getTransformBackToOriginal() {
361    return m_transBackToOriginal;
362  }
363
364  /**
365   * Gets the current settings of PrincipalComponents
366   *
367   * @return an array of strings suitable for passing to setOptions()
368   */
369  public String[] getOptions () {
370
371    String[] options = new String[6];
372    int current = 0;
373
374    if (!getNormalize()) {
375      options[current++] = "-D";
376    }
377
378    options[current++] = "-R";
379    options[current++] = ""+getVarianceCovered();
380
381    options[current++] = "-A";
382    options[current++] = ""+getMaximumAttributeNames();
383
384    if (getTransformBackToOriginal()) {
385      options[current++] = "-O";
386    }
387   
388    while (current < options.length) {
389      options[current++] = "";
390    }
391   
392    return  options;
393  }
394
395  /**
396   * Returns the capabilities of this evaluator.
397   *
398   * @return            the capabilities of this evaluator
399   * @see               Capabilities
400   */
401  public Capabilities getCapabilities() {
402    Capabilities result = super.getCapabilities();
403    result.disableAll();
404   
405    // attributes
406    result.enable(Capability.NOMINAL_ATTRIBUTES);
407    result.enable(Capability.NUMERIC_ATTRIBUTES);
408    result.enable(Capability.DATE_ATTRIBUTES);
409    result.enable(Capability.MISSING_VALUES);
410   
411    // class
412    result.enable(Capability.NOMINAL_CLASS);
413    result.enable(Capability.NUMERIC_CLASS);
414    result.enable(Capability.DATE_CLASS);
415    result.enable(Capability.MISSING_CLASS_VALUES);
416    result.enable(Capability.NO_CLASS);
417   
418    return result;
419  }
420
421  /**
422   * Initializes principal components and performs the analysis
423   * @param data the instances to analyse/transform
424   * @throws Exception if analysis fails
425   */
426  public void buildEvaluator(Instances data) throws Exception {
427    // can evaluator handle data?
428    getCapabilities().testWithFail(data);
429
430    buildAttributeConstructor(data);
431  }
432
433  private void buildAttributeConstructor (Instances data) throws Exception {
434    m_eigenvalues = null;
435    m_outputNumAtts = -1;
436    m_attributeFilter = null;
437    m_nominalToBinFilter = null;
438    m_sumOfEigenValues = 0.0;
439    m_trainInstances = new Instances(data);
440
441    // make a copy of the training data so that we can get the class
442    // column to append to the transformed data (if necessary)
443    m_trainHeader = new Instances(m_trainInstances, 0);
444   
445    m_replaceMissingFilter = new ReplaceMissingValues();
446    m_replaceMissingFilter.setInputFormat(m_trainInstances);
447    m_trainInstances = Filter.useFilter(m_trainInstances, 
448                                        m_replaceMissingFilter);
449
450    if (m_normalize) {
451      m_normalizeFilter = new Normalize();
452      m_normalizeFilter.setInputFormat(m_trainInstances);
453      m_trainInstances = Filter.useFilter(m_trainInstances, m_normalizeFilter);
454    }
455
456    m_nominalToBinFilter = new NominalToBinary();
457    m_nominalToBinFilter.setInputFormat(m_trainInstances);
458    m_trainInstances = Filter.useFilter(m_trainInstances, 
459                                        m_nominalToBinFilter);
460   
461    // delete any attributes with only one distinct value or are all missing
462    Vector deleteCols = new Vector();
463    for (int i=0;i<m_trainInstances.numAttributes();i++) {
464      if (m_trainInstances.numDistinctValues(i) <=1) {
465        deleteCols.addElement(new Integer(i));
466      }
467    }
468
469    if (m_trainInstances.classIndex() >=0) {
470      // get rid of the class column
471      m_hasClass = true;
472      m_classIndex = m_trainInstances.classIndex();
473      deleteCols.addElement(new Integer(m_classIndex));
474    }
475
476    // remove columns from the data if necessary
477    if (deleteCols.size() > 0) {
478      m_attributeFilter = new Remove();
479      int [] todelete = new int [deleteCols.size()];
480      for (int i=0;i<deleteCols.size();i++) {
481        todelete[i] = ((Integer)(deleteCols.elementAt(i))).intValue();
482      }
483      m_attributeFilter.setAttributeIndicesArray(todelete);
484      m_attributeFilter.setInvertSelection(false);
485      m_attributeFilter.setInputFormat(m_trainInstances);
486      m_trainInstances = Filter.useFilter(m_trainInstances, m_attributeFilter);
487    }
488   
489    // can evaluator handle the processed data ? e.g., enough attributes?
490    getCapabilities().testWithFail(m_trainInstances);
491
492    m_numInstances = m_trainInstances.numInstances();
493    m_numAttribs = m_trainInstances.numAttributes();
494
495    fillCorrelation();
496
497    double [] d = new double[m_numAttribs]; 
498    double [][] v = new double[m_numAttribs][m_numAttribs];
499
500    Matrix corr = new Matrix(m_correlation);
501    corr.eigenvalueDecomposition(v, d);
502    m_eigenvectors = (double [][])v.clone();
503    m_eigenvalues = (double [])d.clone();
504
505    // any eigenvalues less than 0 are not worth anything --- change to 0
506    for (int i = 0; i < m_eigenvalues.length; i++) {
507      if (m_eigenvalues[i] < 0) {
508        m_eigenvalues[i] = 0.0;
509      }
510    }
511    m_sortedEigens = Utils.sort(m_eigenvalues);
512    m_sumOfEigenValues = Utils.sum(m_eigenvalues);
513
514    m_transformedFormat = setOutputFormat();
515    if (m_transBackToOriginal) {
516      m_originalSpaceFormat = setOutputFormatOriginal();
517     
518      // new ordered eigenvector matrix
519      int numVectors = (m_transformedFormat.classIndex() < 0) 
520        ? m_transformedFormat.numAttributes()
521        : m_transformedFormat.numAttributes() - 1;
522
523      double [][] orderedVectors = 
524        new double [m_eigenvectors.length][numVectors + 1];
525     
526      // try converting back to the original space
527      for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
528        for (int j = 0; j < m_numAttribs; j++) {
529          orderedVectors[j][m_numAttribs - i] = 
530            m_eigenvectors[j][m_sortedEigens[i]];
531        }
532      }
533     
534      // transpose the matrix
535      int nr = orderedVectors.length;
536      int nc = orderedVectors[0].length;
537      m_eTranspose = 
538        new double [nc][nr];
539      for (int i = 0; i < nc; i++) {
540        for (int j = 0; j < nr; j++) {
541          m_eTranspose[i][j] = orderedVectors[j][i];
542        }
543      }
544    }
545  }
546
547  /**
548   * Returns just the header for the transformed data (ie. an empty
549   * set of instances. This is so that AttributeSelection can
550   * determine the structure of the transformed data without actually
551   * having to get all the transformed data through transformedData().
552   * @return the header of the transformed data.
553   * @throws Exception if the header of the transformed data can't
554   * be determined.
555   */
556  public Instances transformedHeader() throws Exception {
557    if (m_eigenvalues == null) {
558      throw new Exception("Principal components hasn't been built yet");
559    }
560    if (m_transBackToOriginal) {
561      return m_originalSpaceFormat;
562    } else {
563      return m_transformedFormat;
564    }
565  }
566
567  /**
568   * Gets the transformed training data.
569   * @return the transformed training data
570   * @throws Exception if transformed data can't be returned
571   */
572  public Instances transformedData(Instances data) throws Exception {
573    if (m_eigenvalues == null) {
574      throw new Exception("Principal components hasn't been built yet");
575    }
576
577    Instances output = null;
578
579    if (m_transBackToOriginal) {
580      output = new Instances(m_originalSpaceFormat);
581    } else {
582      output = new Instances(m_transformedFormat);
583    }
584    for (int i = 0; i < data.numInstances(); i++) {
585      Instance converted = convertInstance(data.instance(i));
586      output.add(converted);
587    }
588
589    return output;
590  }
591
592  /**
593   * Evaluates the merit of a transformed attribute. This is defined
594   * to be 1 minus the cumulative variance explained. Merit can't
595   * be meaningfully evaluated if the data is to be transformed back
596   * to the original space.
597   * @param att the attribute to be evaluated
598   * @return the merit of a transformed attribute
599   * @throws Exception if attribute can't be evaluated
600   */
601  public double evaluateAttribute(int att) throws Exception {
602    if (m_eigenvalues == null) {
603      throw new Exception("Principal components hasn't been built yet!");
604    }
605
606    if (m_transBackToOriginal) {
607      return 1.0; // can't evaluate back in the original space!
608    }
609
610    // return 1-cumulative variance explained for this transformed att
611    double cumulative = 0.0;
612    for (int i = m_numAttribs - 1; i >= m_numAttribs - att - 1; i--) {
613      cumulative += m_eigenvalues[m_sortedEigens[i]];
614    }
615
616    return 1.0 - cumulative / m_sumOfEigenValues;
617  }
618
619  /**
620   * Fill the correlation matrix
621   */
622  private void fillCorrelation() {
623    m_correlation = new double[m_numAttribs][m_numAttribs];
624    double [] att1 = new double [m_numInstances];
625    double [] att2 = new double [m_numInstances];
626    double corr;
627
628    for (int i = 0; i < m_numAttribs; i++) {
629      for (int j = 0; j < m_numAttribs; j++) {
630        if (i == j) {
631          m_correlation[i][j] = 1.0;
632        } else {
633          for (int k = 0; k < m_numInstances; k++) {
634            att1[k] = m_trainInstances.instance(k).value(i);
635            att2[k] = m_trainInstances.instance(k).value(j);
636          }
637          corr = Utils.correlation(att1,att2,m_numInstances);
638          m_correlation[i][j] = corr;
639          m_correlation[j][i] = corr;
640        }
641      }
642    }
643  }
644
645  /**
646   * Return a summary of the analysis
647   * @return a summary of the analysis.
648   */
649  private String principalComponentsSummary() {
650    StringBuffer result = new StringBuffer();
651    double cumulative = 0.0;
652    Instances output = null;
653    int numVectors=0;
654
655    try {
656      output = setOutputFormat();
657      numVectors = (output.classIndex() < 0) 
658        ? output.numAttributes()
659        : output.numAttributes()-1;
660    } catch (Exception ex) {
661    }
662    //tomorrow
663    result.append("Correlation matrix\n"+matrixToString(m_correlation)
664                  +"\n\n");
665    result.append("eigenvalue\tproportion\tcumulative\n");
666    for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
667      cumulative+=m_eigenvalues[m_sortedEigens[i]];
668      result.append(Utils.doubleToString(m_eigenvalues[m_sortedEigens[i]],9,5)
669                    +"\t"+Utils.
670                    doubleToString((m_eigenvalues[m_sortedEigens[i]] / 
671                                    m_sumOfEigenValues),
672                                     9,5)
673                    +"\t"+Utils.doubleToString((cumulative / 
674                                                m_sumOfEigenValues),9,5)
675                    +"\t"+output.attribute(m_numAttribs - i - 1).name()+"\n");
676    }
677
678    result.append("\nEigenvectors\n");
679    for (int j = 1;j <= numVectors;j++) {
680      result.append(" V"+j+'\t');
681    }
682    result.append("\n");
683    for (int j = 0; j < m_numAttribs; j++) {
684
685      for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
686        result.append(Utils.
687                      doubleToString(m_eigenvectors[j][m_sortedEigens[i]],7,4)
688                      +"\t");
689      }
690      result.append(m_trainInstances.attribute(j).name()+'\n');
691    }
692
693    if (m_transBackToOriginal) {
694      result.append("\nPC space transformed back to original space.\n"
695                    +"(Note: can't evaluate attributes in the original "
696                    +"space)\n");
697    }
698    return result.toString();
699  }
700
701  /**
702   * Returns a description of this attribute transformer
703   * @return a String describing this attribute transformer
704   */
705  public String toString() {
706    if (m_eigenvalues == null) {
707      return "Principal components hasn't been built yet!";
708    } else {
709      return "\tPrincipal Components Attribute Transformer\n\n"
710        +principalComponentsSummary();
711    }
712  }
713
714  /**
715   * Return a matrix as a String
716   * @param matrix that is decribed as a string
717   * @return a String describing a matrix
718   */
719  private String matrixToString(double [][] matrix) {
720    StringBuffer result = new StringBuffer();
721    int last = matrix.length - 1;
722
723    for (int i = 0; i <= last; i++) {
724      for (int j = 0; j <= last; j++) {
725        result.append(Utils.doubleToString(matrix[i][j],6,2)+" ");
726        if (j == last) {
727          result.append('\n');
728        }
729      }
730    }
731    return result.toString();
732  }
733
734  /**
735   * Convert a pc transformed instance back to the original space
736   *
737   * @param inst        the instance to convert
738   * @return            the processed instance
739   * @throws Exception  if something goes wrong
740   */
741  private Instance convertInstanceToOriginal(Instance inst)
742    throws Exception {
743    double[] newVals = null;
744
745    if (m_hasClass) {
746      newVals = new double[m_numAttribs+1];
747    } else {
748      newVals = new double[m_numAttribs];
749    }
750
751    if (m_hasClass) {
752      // class is always appended as the last attribute
753      newVals[m_numAttribs] = inst.value(inst.numAttributes() - 1);
754    }
755
756    for (int i = 0; i < m_eTranspose[0].length; i++) {
757      double tempval = 0.0;
758      for (int j = 1; j < m_eTranspose.length; j++) {
759        tempval += (m_eTranspose[j][i] * 
760                    inst.value(j - 1));
761       }
762      newVals[i] = tempval;
763    }
764   
765    if (inst instanceof SparseInstance) {
766      return new SparseInstance(inst.weight(), newVals);
767    } else {
768      return new DenseInstance(inst.weight(), newVals);
769    }     
770  }
771
772  /**
773   * Transform an instance in original (unormalized) format. Convert back
774   * to the original space if requested.
775   * @param instance an instance in the original (unormalized) format
776   * @return a transformed instance
777   * @throws Exception if instance cant be transformed
778   */
779  public Instance convertInstance(Instance instance) throws Exception {
780
781    if (m_eigenvalues == null) {
782      throw new Exception("convertInstance: Principal components not "
783                          +"built yet");
784    }
785
786    double[] newVals = new double[m_outputNumAtts];
787    Instance tempInst = (Instance)instance.copy();
788    if (!instance.dataset().equalHeaders(m_trainHeader)) {
789      throw new Exception("Can't convert instance: header's don't match: "
790                          +"PrincipalComponents\n"
791                          + instance.dataset().equalHeadersMsg(m_trainHeader));
792    }
793
794    m_replaceMissingFilter.input(tempInst);
795    m_replaceMissingFilter.batchFinished();
796    tempInst = m_replaceMissingFilter.output();
797
798    if (m_normalize) {
799      m_normalizeFilter.input(tempInst);
800      m_normalizeFilter.batchFinished();
801      tempInst = m_normalizeFilter.output();
802    }
803
804    m_nominalToBinFilter.input(tempInst);
805    m_nominalToBinFilter.batchFinished();
806    tempInst = m_nominalToBinFilter.output();
807
808    if (m_attributeFilter != null) {
809      m_attributeFilter.input(tempInst);
810      m_attributeFilter.batchFinished();
811      tempInst = m_attributeFilter.output();
812    }
813
814    if (m_hasClass) {
815       newVals[m_outputNumAtts - 1] = instance.value(instance.classIndex());
816    }
817
818    double cumulative = 0;
819    for (int i = m_numAttribs - 1; i >= 0; i--) {
820      double tempval = 0.0;
821      for (int j = 0; j < m_numAttribs; j++) {
822        tempval += (m_eigenvectors[j][m_sortedEigens[i]] * 
823                    tempInst.value(j));
824       }
825      newVals[m_numAttribs - i - 1] = tempval;
826      cumulative+=m_eigenvalues[m_sortedEigens[i]];
827      if ((cumulative / m_sumOfEigenValues) >= m_coverVariance) {
828        break;
829      }
830    }
831   
832    if (!m_transBackToOriginal) {
833      if (instance instanceof SparseInstance) {
834      return new SparseInstance(instance.weight(), newVals);
835      } else {
836        return new DenseInstance(instance.weight(), newVals);
837      }     
838    } else {
839      if (instance instanceof SparseInstance) {
840        return convertInstanceToOriginal(new SparseInstance(instance.weight(), 
841                                                            newVals));
842      } else {
843        return convertInstanceToOriginal(new DenseInstance(instance.weight(),
844                                                      newVals));
845      }
846    }
847  }
848
849  /**
850   * Set up the header for the PC->original space dataset
851   *
852   * @return            the output format
853   * @throws Exception  if something goes wrong
854   */
855  private Instances setOutputFormatOriginal() throws Exception {
856    FastVector attributes = new FastVector();
857   
858    for (int i = 0; i < m_numAttribs; i++) {
859      String att = m_trainInstances.attribute(i).name();
860      attributes.addElement(new Attribute(att));
861    }
862   
863    if (m_hasClass) {
864      attributes.addElement(m_trainHeader.classAttribute().copy());
865    }
866
867    Instances outputFormat = 
868      new Instances(m_trainHeader.relationName()+"->PC->original space",
869                    attributes, 0);
870   
871    // set the class to be the last attribute if necessary
872    if (m_hasClass) {
873      outputFormat.setClassIndex(outputFormat.numAttributes()-1);
874    }
875
876    return outputFormat;
877  }
878
879  /**
880   * Set the format for the transformed data
881   * @return a set of empty Instances (header only) in the new format
882   * @throws Exception if the output format can't be set
883   */
884  private Instances setOutputFormat() throws Exception {
885    if (m_eigenvalues == null) {
886      return null;
887    }
888
889    double cumulative = 0.0;
890    FastVector attributes = new FastVector();
891     for (int i = m_numAttribs - 1; i >= 0; i--) {
892       StringBuffer attName = new StringBuffer();
893       // build array of coefficients
894       double[] coeff_mags = new double[m_numAttribs];
895       for (int j = 0; j < m_numAttribs; j++)
896         coeff_mags[j] = -Math.abs(m_eigenvectors[j][m_sortedEigens[i]]);
897       int num_attrs = (m_maxAttrsInName > 0) ? Math.min(m_numAttribs, m_maxAttrsInName) : m_numAttribs;
898       // this array contains the sorted indices of the coefficients
899       int[] coeff_inds;
900       if (m_numAttribs > 0) {
901          // if m_maxAttrsInName > 0, sort coefficients by decreasing magnitude
902          coeff_inds = Utils.sort(coeff_mags);
903       } else {
904          // if  m_maxAttrsInName <= 0, use all coeffs in original order
905          coeff_inds = new int[m_numAttribs];
906          for (int j=0; j<m_numAttribs; j++)
907            coeff_inds[j] = j;
908       }
909       // build final attName string
910       for (int j = 0; j < num_attrs; j++) {
911         double coeff_value = m_eigenvectors[coeff_inds[j]][m_sortedEigens[i]];
912         if (j > 0 && coeff_value >= 0)
913           attName.append("+");
914         attName.append(Utils.doubleToString(coeff_value,5,3)
915                        +m_trainInstances.attribute(coeff_inds[j]).name());
916       }
917       if (num_attrs < m_numAttribs)
918         attName.append("...");
919         
920       attributes.addElement(new Attribute(attName.toString()));
921       cumulative+=m_eigenvalues[m_sortedEigens[i]];
922
923       if ((cumulative / m_sumOfEigenValues) >= m_coverVariance) {
924         break;
925       }
926     }
927     
928     if (m_hasClass) {
929       attributes.addElement(m_trainHeader.classAttribute().copy());
930     }
931
932     Instances outputFormat = 
933       new Instances(m_trainInstances.relationName()+"_principal components",
934                     attributes, 0);
935
936     // set the class to be the last attribute if necessary
937     if (m_hasClass) {
938       outputFormat.setClassIndex(outputFormat.numAttributes()-1);
939     }
940     
941     m_outputNumAtts = outputFormat.numAttributes();
942     return outputFormat;
943  }
944 
945  /**
946   * Returns the revision string.
947   *
948   * @return            the revision
949   */
950  public String getRevision() {
951    return RevisionUtils.extract("$Revision: 5987 $");
952  }
953
954  /**
955   * Main method for testing this class
956   * @param argv should contain the command line arguments to the
957   * evaluator/transformer (see AttributeSelection)
958   */
959  public static void main(String [] argv) {
960    runEvaluator(new PrincipalComponents(), argv);
961  }
962}
Note: See TracBrowser for help on using the repository browser.