source: src/main/java/weka/attributeSelection/LatentSemanticAnalysis.java @ 20

Last change on this file since 20 was 4, checked in by gnappo, 14 years ago

Import di weka.

File size: 27.6 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 *    LatentSemanticAnalysis.java
19 *    Copyright (C) 2008 Amri Napolitano
20 *
21 */
22
23package weka.attributeSelection;
24
25import weka.core.Attribute;
26import weka.core.Capabilities;
27import weka.core.Check;
28import weka.core.CheckOptionHandler;
29import weka.core.FastVector;
30import weka.core.Instance;
31import weka.core.DenseInstance;
32import weka.core.Instances;
33import weka.core.matrix.Matrix;
34import weka.core.Option;
35import weka.core.OptionHandler;
36import weka.core.RevisionUtils;
37import weka.core.SparseInstance;
38import weka.core.Utils;
39import weka.core.Capabilities.Capability;
40import weka.core.matrix.SingularValueDecomposition;
41import weka.filters.Filter;
42import weka.filters.unsupervised.attribute.NominalToBinary;
43import weka.filters.unsupervised.attribute.Normalize;
44import weka.filters.unsupervised.attribute.Remove;
45import weka.filters.unsupervised.attribute.ReplaceMissingValues;
46
47import java.io.BufferedReader;
48import java.io.File;
49import java.io.FileReader;
50import java.util.Enumeration;
51import java.util.Vector;
52
53/**
54 <!-- globalinfo-start -->
55 * Performs latent semantic analysis and transformation of the data.
56 * Use in conjunction with a Ranker search. A low-rank approximation
57 * of the full data is found by specifying the number of singular values
58 * to use. The dataset may be transformed to give the relation of either
59 * the attributes or the instances (default) to the concept space created
60 * by the transformation.
61 * <p/>
62 <!-- globalinfo-end -->
63 *
64 <!-- options-start -->
65 * Valid options are: <p/>
66 *
67 * <pre> -N
68 *  Normalize input data.</pre>
69 *
70 * <pre> -R
71 *  Rank approximation used in LSA. May be actual number of
72 *  LSA attributes to include (if greater than 1) or a proportion
73 *  of total singular values to account for (if between 0 and 1).
74 *  A value less than or equal to zero means use all latent variables.
75 *  (default = 0.95)</pre>
76 *
77 * <pre> -A
78 *  Maximum number of attributes to include in
79 *  transformed attribute names. (-1 = include all)</pre>
80 *
81 <!-- options-end -->
82 *
83 * @author Amri Napolitano
84 * @version $Revision: 5987 $
85 */
86
87public class LatentSemanticAnalysis 
88extends UnsupervisedAttributeEvaluator
89implements AttributeTransformer, OptionHandler {
90 
91  /** For serialization */
92  static final long serialVersionUID = -8712112988018106198L;
93 
94  /** The data to transform analyse/transform */
95  private Instances m_trainInstances;
96 
97  /**
98   * Keep a copy for the class attribute (if set) and for
99   * checking for header compatibility
100   */
101  private Instances m_trainHeader;
102 
103  /** The header for the transformed data format */
104  private Instances m_transformedFormat;
105 
106  /** Data has a class set */
107  private boolean m_hasClass;
108 
109  /** Class index */
110  private int m_classIndex;
111 
112  /** Number of attributes */
113  private int m_numAttributes;
114 
115  /** Number of instances */
116  private int m_numInstances;
117 
118  /** Is transpose necessary because numAttributes < numInstances? */
119  private boolean m_transpose = false;
120 
121  /** Will hold the left singular vectors */
122  private Matrix m_u = null;
123 
124  /** Will hold the singular values */
125  private Matrix m_s = null;
126 
127  /** Will hold the right singular values */
128  private Matrix m_v = null;
129 
130  /** Will hold the matrix used to transform instances to the new feature space */
131  private Matrix m_transformationMatrix = null;
132 
133  /** Filters for original data */
134  private ReplaceMissingValues m_replaceMissingFilter;
135  private Normalize m_normalizeFilter;
136  private NominalToBinary m_nominalToBinaryFilter;
137  private Remove m_attributeFilter;
138 
139  /** The number of attributes in the LSA transformed data */
140  private int m_outputNumAttributes = -1;
141 
142  /** Normalize the input data? */
143  private boolean m_normalize = false;
144 
145  /** The approximation rank to use (between 0 and 1 means coverage proportion) */
146  private double m_rank = 0.95;
147 
148  /** The sum of the squares of the singular values */
149  private double m_sumSquaredSingularValues = 0.0;
150 
151  /** The actual rank number to use for computation */
152  private int m_actualRank = -1;
153 
154  /** Maximum number of attributes in the transformed attribute name */
155  private int m_maxAttributesInName = 5;
156 
157  /**
158   * Returns a string describing this attribute transformer
159   * @return a description of the evaluator suitable for
160   * displaying in the explorer/experimenter gui
161   */
162  public String globalInfo() {
163    return "Performs latent semantic analysis and transformation of the data. Use in " +
164            "conjunction with a Ranker search. A low-rank approximation of the full data is " +
165            "found by either specifying the number of singular values to use or specifying a " +
166            "proportion of the singular values to cover.";
167  }
168 
169  /**
170   * Returns an enumeration describing the available options. <p>
171   *
172   * @return an enumeration of all the available options.
173   **/
174  public Enumeration listOptions () {
175    Vector options = new Vector(4);
176    options.addElement(new Option("\tNormalize input data.", "N", 0, "-N"));
177   
178    options.addElement(new Option("\tRank approximation used in LSA. \n" +
179                                   "\tMay be actual number of LSA attributes \n" +
180                                   "\tto include (if greater than 1) or a \n" +
181                                   "\tproportion of total singular values to \n" +
182                                   "\taccount for (if between 0 and 1). \n" +
183                                   "\tA value less than or equal to zero means \n" +
184                                   "\tuse all latent variables.(default = 0.95)",
185                                   "R",1,"-R"));
186   
187    options.addElement(new Option("\tMaximum number of attributes to include\n" +
188                                   "\tin transformed attribute names.\n" +
189                                   "\t(-1 = include all)"
190                                   , "A", 1, "-A"));
191    return  options.elements();
192  }
193 
194  /**
195   * Parses a given list of options. <p/>
196   *
197   <!-- options-start -->
198   * Valid options are: <p/>
199   *
200   * <pre> -N
201   *  Normalize input data.</pre>
202   *
203   * <pre> -R
204   *  Rank approximation used in LSA. May be actual number of
205   *  LSA attributes to include (if greater than 1) or a proportion
206   *  of total singular values to account for (if between 0 and 1).
207   *  A value less than or equal to zero means use all latent variables.
208   *  (default = 0.95)</pre>
209   *
210   * <pre> -A
211   *  Maximum number of attributes to include in
212   *  transformed attribute names. (-1 = include all)</pre>
213   *
214   <!-- options-end -->
215   *
216   * @param options the list of options as an array of strings
217   * @throws Exception if an option is not supported
218   */
219  public void setOptions (String[] options)
220  throws Exception {
221    resetOptions();
222    String optionString;
223   
224    //set approximation rank
225    optionString = Utils.getOption('R', options);
226    if (optionString.length() != 0) {
227      double temp;
228      temp = Double.valueOf(optionString).doubleValue();
229      setRank(temp);
230    }
231   
232    //set number of attributes to use in transformed names
233    optionString = Utils.getOption('A', options);
234    if (optionString.length() != 0) {
235      setMaximumAttributeNames(Integer.parseInt(optionString));
236    }
237   
238    //set normalize option
239    setNormalize(Utils.getFlag('N', options));
240  }
241 
242  /**
243   * Reset to defaults
244   */
245  private void resetOptions() {
246    m_rank = 0.95;
247    m_normalize = true;
248    m_maxAttributesInName = 5;
249  }
250 
251  /**
252   * Returns the tip text for this property
253   * @return tip text for this property suitable for
254   * displaying in the explorer/experimenter gui
255   */
256  public String normalizeTipText() {
257    return "Normalize input data.";
258  }
259 
260  /**
261   * Set whether input data will be normalized.
262   * @param newNormalize true if input data is to be normalized
263   */
264  public void setNormalize(boolean newNormalize) {
265    m_normalize = newNormalize;
266  }
267 
268  /**
269   * Gets whether or not input data is to be normalized
270   * @return true if input data is to be normalized
271   */
272  public boolean getNormalize() {
273    return m_normalize;
274  }
275 
276  /**
277   * Returns the tip text for this property
278   * @return tip text for this property suitable for
279   * displaying in the explorer/experimenter gui
280   */
281  public String rankTipText() {
282    return "Matrix rank to use for data reduction. Can be a" +
283    " proportion to indicate desired coverage";
284  }
285 
286  /**
287   * Sets the desired matrix rank (or coverage proportion) for feature-space reduction
288   * @param newRank the desired rank (or coverage) for feature-space reduction
289   */
290  public void setRank(double newRank) {
291      m_rank = newRank;
292  }
293 
294  /**
295   * Gets the desired matrix rank (or coverage proportion) for feature-space reduction
296   * @return the rank (or coverage) for feature-space reduction
297   */
298  public double getRank() {
299    return m_rank;
300  }
301 
302  /**
303   * Returns the tip text for this property
304   * @return tip text for this property suitable for
305   * displaying in the explorer/experimenter gui
306   */
307  public String maximumAttributeNamesTipText() {
308    return "The maximum number of attributes to include in transformed attribute names.";
309  }
310 
311  /**
312   * Sets maximum number of attributes to include in
313   * transformed attribute names.
314   * @param newMaxAttributes the maximum number of attributes
315   */
316  public void setMaximumAttributeNames(int newMaxAttributes) {
317    m_maxAttributesInName = newMaxAttributes;
318  }
319 
320  /**
321   * Gets maximum number of attributes to include in
322   * transformed attribute names.
323   * @return the maximum number of attributes
324   */
325  public int getMaximumAttributeNames() {
326    return m_maxAttributesInName;
327  }
328 
329  /**
330   * Gets the current settings of LatentSemanticAnalysis
331   *
332   * @return an array of strings suitable for passing to setOptions()
333   */
334  public String[] getOptions () {
335   
336    String[] options = new String[5];
337    int current = 0;
338   
339    if (getNormalize()) {
340      options[current++] = "-N";
341    }
342   
343    options[current++] = "-R";
344    options[current++] = "" + getRank();
345   
346    options[current++] = "-A";
347    options[current++] = "" + getMaximumAttributeNames();
348   
349    while (current < options.length) {
350      options[current++] = "";
351    }
352   
353    return  options;
354  }
355 
356  /**
357   * Returns the capabilities of this evaluator.
358   *
359   * @return            the capabilities of this evaluator
360   * @see               Capabilities
361   */
362  public Capabilities getCapabilities() {
363    Capabilities result = super.getCapabilities();
364    result.disableAll();
365   
366    // attributes
367    result.enable(Capability.NOMINAL_ATTRIBUTES);
368    result.enable(Capability.NUMERIC_ATTRIBUTES);
369    result.enable(Capability.DATE_ATTRIBUTES);
370    result.enable(Capability.MISSING_VALUES);
371   
372    // class
373    result.enable(Capability.NOMINAL_CLASS);
374    result.enable(Capability.NUMERIC_CLASS);
375    result.enable(Capability.DATE_CLASS);
376    result.enable(Capability.MISSING_CLASS_VALUES);
377    result.enable(Capability.NO_CLASS);
378   
379    return result;
380  }
381 
382  /**
383   * Initializes the singular values/vectors and performs the analysis
384   * @param data the instances to analyse/transform
385   * @throws Exception if analysis fails
386   */
387  public void buildEvaluator(Instances data) throws Exception {
388    // can evaluator handle data?
389    getCapabilities().testWithFail(data);
390   
391    buildAttributeConstructor(data);
392  }
393 
394  /**
395   * Initializes the singular values/vectors and performs the analysis
396   * @param data the instances to analyse/transform
397   * @throws Exception if analysis fails
398   */
399  private void buildAttributeConstructor (Instances data) throws Exception {
400    // initialize attributes for performing analysis
401    m_transpose = false;
402    m_s = null;
403    m_u = null;
404    m_v = null;
405    m_outputNumAttributes = -1;
406    m_actualRank = -1;
407    m_sumSquaredSingularValues = 0.0;
408   
409    m_trainInstances = new Instances(data);
410    m_trainHeader = null;
411   
412    m_attributeFilter = null;
413    m_nominalToBinaryFilter = null;
414   
415    m_replaceMissingFilter = new ReplaceMissingValues();
416    m_replaceMissingFilter.setInputFormat(m_trainInstances);
417    m_trainInstances = Filter.useFilter(m_trainInstances, m_replaceMissingFilter);
418   
419    // vector to hold indices of attributes to delete (class attribute,
420    // attributes that are all missing, or attributes with one distinct value)
421    Vector attributesToRemove = new Vector();
422   
423    // if data has a class attribute
424    if (m_trainInstances.classIndex() >= 0) {
425     
426      m_hasClass = true;
427      m_classIndex = m_trainInstances.classIndex();
428     
429      // set class attribute to be removed
430      attributesToRemove.addElement(new Integer(m_classIndex));
431    }
432    // make copy of training data so the class values (if set) can be appended to final
433    // transformed instances and so that we can check header compatibility
434    m_trainHeader = new Instances(m_trainInstances, 0);
435   
436    // normalize data if desired
437    if (m_normalize) {
438      m_normalizeFilter = new Normalize();
439      m_normalizeFilter.setInputFormat(m_trainInstances);
440      m_trainInstances = Filter.useFilter(m_trainInstances, m_normalizeFilter);
441    }
442   
443    // convert any nominal attributes to binary numeric attributes
444    m_nominalToBinaryFilter = new NominalToBinary();
445    m_nominalToBinaryFilter.setInputFormat(m_trainInstances);
446    m_trainInstances = Filter.useFilter(m_trainInstances, m_nominalToBinaryFilter);
447   
448    // delete any attributes with only one distinct value or are all missing
449    for (int i = 0; i < m_trainInstances.numAttributes(); i++) {
450      if (m_trainInstances.numDistinctValues(i) <= 1) {
451        attributesToRemove.addElement(new Integer(i));
452      }
453    }
454   
455    // remove columns from the data if necessary
456    if (attributesToRemove.size() > 0) {
457      m_attributeFilter = new Remove();
458      int [] todelete = new int[attributesToRemove.size()];
459      for (int i = 0; i < attributesToRemove.size(); i++) {
460        todelete[i] = ((Integer)(attributesToRemove.elementAt(i))).intValue();
461      }
462      m_attributeFilter.setAttributeIndicesArray(todelete);
463      m_attributeFilter.setInvertSelection(false);
464      m_attributeFilter.setInputFormat(m_trainInstances);
465      m_trainInstances = Filter.useFilter(m_trainInstances, m_attributeFilter);
466    }
467   
468    // can evaluator handle the processed data ? e.g., enough attributes?
469    getCapabilities().testWithFail(m_trainInstances);
470   
471    // record properties of final, ready-to-process data
472    m_numInstances = m_trainInstances.numInstances();
473    m_numAttributes = m_trainInstances.numAttributes();
474   
475    // create matrix of attribute values and compute singular value decomposition
476    double [][] trainValues = new double[m_numAttributes][m_numInstances];
477    for (int i = 0; i < m_numAttributes; i++) {
478      trainValues[i] = m_trainInstances.attributeToDoubleArray(i);
479    }
480    Matrix trainMatrix = new Matrix(trainValues);
481    // svd requires rows >= columns, so transpose data if necessary
482    if (m_numAttributes < m_numInstances) {
483      m_transpose = true;
484      trainMatrix = trainMatrix.transpose();
485    }
486    SingularValueDecomposition trainSVD = trainMatrix.svd();
487    m_u = trainSVD.getU(); // left singular vectors
488    m_s = trainSVD.getS(); // singular values
489    m_v = trainSVD.getV(); // right singular vectors
490   
491    // find actual rank to use
492    int maxSingularValues = trainSVD.rank();
493    for (int i = 0; i < m_s.getRowDimension(); i++) {
494      m_sumSquaredSingularValues += m_s.get(i, i) * m_s.get(i, i);
495    }
496    if (maxSingularValues == 0) { // no nonzero singular values (shouldn't happen)
497      // reset values from computation
498      m_s = null;
499      m_u = null;
500      m_v = null;
501      m_sumSquaredSingularValues = 0.0;
502     
503      throw new Exception("SVD computation produced no non-zero singular values.");
504    }
505    if (m_rank > maxSingularValues || m_rank <= 0) { // adjust rank if too high or too low
506      m_actualRank = maxSingularValues;
507    } else if (m_rank < 1.0) { // determine how many singular values to include for desired coverage
508      double currentSumOfSquaredSingularValues = 0.0;
509      for (int i = 0; i < m_s.getRowDimension() && m_actualRank == -1; i++) {
510        currentSumOfSquaredSingularValues += m_s.get(i, i) * m_s.get(i, i);
511        if (currentSumOfSquaredSingularValues / m_sumSquaredSingularValues >= m_rank) {
512          m_actualRank = i + 1;
513        }
514      }
515    } else {
516      m_actualRank = (int) m_rank;
517    }
518   
519    // lower matrix ranks, adjust for transposition (if necessary), and
520    // compute matrix for transforming future instances
521    if (m_transpose) {
522      Matrix tempMatrix = m_u;
523      m_u = m_v;
524      m_v = tempMatrix;
525    }
526    m_u = m_u.getMatrix(0, m_u.getRowDimension() - 1, 0, m_actualRank - 1);
527    m_s = m_s.getMatrix(0, m_actualRank - 1, 0, m_actualRank - 1);
528    m_v = m_v.getMatrix(0, m_v.getRowDimension() - 1, 0, m_actualRank - 1);
529    m_transformationMatrix = m_u.times(m_s.inverse());
530   
531    //create dataset header for transformed instances
532    m_transformedFormat = setOutputFormat();
533  }
534 
535  /**
536   * Set the format for the transformed data
537   * @return a set of empty Instances (header only) in the new format
538   */
539  private Instances setOutputFormat() {
540    // if analysis hasn't been performed (successfully) yet
541    if (m_s == null) {
542      return null;
543    }
544   
545    // set up transformed attributes
546    if (m_hasClass) {
547      m_outputNumAttributes = m_actualRank + 1;
548    } else {
549      m_outputNumAttributes = m_actualRank;
550    }
551    int numAttributesInName = m_maxAttributesInName;
552    if (numAttributesInName <= 0 || numAttributesInName >= m_numAttributes) {
553      numAttributesInName = m_numAttributes;
554    }
555    FastVector attributes = new FastVector(m_outputNumAttributes);
556    for (int i = 0; i < m_actualRank; i++) {
557      // create attribute name
558      String attributeName = "";
559      double [] attributeCoefficients = 
560        m_transformationMatrix.getMatrix(0, m_numAttributes - 1, i, i).getColumnPackedCopy();
561      for (int j = 0; j < numAttributesInName; j++) {
562        if (j > 0) {
563          attributeName += "+";
564        }
565        attributeName += Utils.doubleToString(attributeCoefficients[j], 5, 3);
566        attributeName += m_trainInstances.attribute(j).name();
567      }
568      if (numAttributesInName < m_numAttributes) {
569        attributeName += "...";
570      }
571      // add attribute
572      attributes.addElement(new Attribute(attributeName));
573    }
574    // add original class attribute if present
575    if (m_hasClass) {
576      attributes.addElement(m_trainHeader.classAttribute().copy());
577    }
578    // create blank header
579    Instances outputFormat = new Instances(m_trainInstances.relationName() + "_LSA", 
580        attributes, 0);
581    m_outputNumAttributes = outputFormat.numAttributes();
582    // set class attribute if applicable
583    if (m_hasClass) {
584      outputFormat.setClassIndex(m_outputNumAttributes - 1);
585    }
586   
587    return outputFormat;
588  }
589 
590  /**
591   * Returns just the header for the transformed data (ie. an empty
592   * set of instances. This is so that AttributeSelection can
593   * determine the structure of the transformed data without actually
594   * having to get all the transformed data through getTransformedData().
595   * @return the header of the transformed data.
596   * @throws Exception if the header of the transformed data can't
597   * be determined.
598   */
599  public Instances transformedHeader() throws Exception {
600    if (m_s == null) {
601      throw new Exception("Latent Semantic Analysis hasn't been successfully performed.");
602    }
603    return m_transformedFormat;
604  }
605 
606  /**
607   * Transform the supplied data set (assumed to be the same format
608   * as the training data)
609   * @return the transformed training data
610   * @throws Exception if transformed data can't be returned
611   */
612  public Instances transformedData(Instances data) throws Exception {
613    if (m_s == null) {
614      throw new Exception("Latent Semantic Analysis hasn't been built yet");
615    }
616   
617    Instances output = new Instances(m_transformedFormat, m_numInstances);
618   
619    // the transformed version of instance i from the training data
620    // is stored as the i'th row vector in v (the right singular vectors)
621    for (int i = 0; i < data.numInstances(); i++) {
622      Instance currentInstance = data.instance(i);
623      // record attribute values for converted instance
624      double [] newValues = new double[m_outputNumAttributes];
625      for (int j = 0; j < m_actualRank; j++) { // fill in values from v
626        newValues[j] = m_v.get(i, j);
627      }
628      if (m_hasClass) { // copy class value if applicable
629        newValues[m_outputNumAttributes - 1] = currentInstance.classValue();
630      }
631      //create new instance with recorded values and add to output dataset
632      Instance newInstance;
633      if (currentInstance instanceof SparseInstance) {
634        newInstance = new SparseInstance(currentInstance.weight(), newValues);
635      } else {
636        newInstance = new DenseInstance(currentInstance.weight(), newValues);
637      }
638      output.add(newInstance);
639    }
640   
641    return output;
642  }
643 
644  /**
645   * Evaluates the merit of a transformed attribute. This is defined
646   * to be the square of the singular value for the latent variable
647   * corresponding to the transformed attribute.
648   * @param att the attribute to be evaluated
649   * @return the merit of a transformed attribute
650   * @throws Exception if attribute can't be evaluated
651   */
652  public double evaluateAttribute(int att) throws Exception {
653    if (m_s == null) {
654      throw new Exception("Latent Semantic Analysis hasn't been successfully" +
655                            " performed yet!");
656    }
657   
658    //return the square of the corresponding singular value
659    return (m_s.get(att, att) * m_s.get(att, att)) / m_sumSquaredSingularValues;
660  }
661 
662  /**
663   * Transform an instance in original (unnormalized) format
664   * @param instance an instance in the original (unnormalized) format
665   * @return a transformed instance
666   * @throws Exception if instance can't be transformed
667   */
668  public Instance convertInstance(Instance instance) throws Exception {
669    if (m_s == null) {
670      throw new Exception("convertInstance: Latent Semantic Analysis not " +
671                           "performed yet.");
672    }
673   
674    // array to hold new attribute values
675    double [] newValues = new double[m_outputNumAttributes];
676   
677    // apply filters so new instance is in same format as training instances
678    Instance tempInstance = (Instance)instance.copy();
679    if (!instance.dataset().equalHeaders(m_trainHeader)) {
680      throw new Exception("Can't convert instance: headers don't match: " +
681      "LatentSemanticAnalysis\n" + instance.dataset().equalHeadersMsg(m_trainHeader));
682    }
683    // replace missing values
684    m_replaceMissingFilter.input(tempInstance);
685    m_replaceMissingFilter.batchFinished();
686    tempInstance = m_replaceMissingFilter.output();
687    // normalize
688    if (m_normalize) {
689      m_normalizeFilter.input(tempInstance);
690      m_normalizeFilter.batchFinished();
691      tempInstance = m_normalizeFilter.output();
692    }
693    // convert nominal attributes to binary
694    m_nominalToBinaryFilter.input(tempInstance);
695    m_nominalToBinaryFilter.batchFinished();
696    tempInstance = m_nominalToBinaryFilter.output();
697    // remove class/other attributes
698    if (m_attributeFilter != null) {
699      m_attributeFilter.input(tempInstance);
700      m_attributeFilter.batchFinished();
701      tempInstance = m_attributeFilter.output();
702    }
703   
704    // record new attribute values
705    if (m_hasClass) { // copy class value
706      newValues[m_outputNumAttributes - 1] = instance.classValue();
707    }
708    double [][] oldInstanceValues = new double[1][m_numAttributes];
709    oldInstanceValues[0] = tempInstance.toDoubleArray();
710    Matrix instanceVector = new Matrix(oldInstanceValues); // old attribute values
711    instanceVector = instanceVector.times(m_transformationMatrix); // new attribute values
712    for (int i = 0; i < m_actualRank; i++) {
713      newValues[i] = instanceVector.get(0, i);
714    }
715   
716    // return newly transformed instance
717    if (instance instanceof SparseInstance) {
718      return new SparseInstance(instance.weight(), newValues);
719    } else {
720      return new DenseInstance(instance.weight(), newValues);
721    }
722  }
723 
724  /**
725   * Returns a description of this attribute transformer
726   * @return a String describing this attribute transformer
727   */
728  public String toString() {
729    if (m_s == null) {
730      return "Latent Semantic Analysis hasn't been built yet!";
731    } else {
732      return "\tLatent Semantic Analysis Attribute Transformer\n\n"
733      + lsaSummary();
734    }
735  }
736 
737  /**
738   * Return a summary of the analysis
739   * @return a summary of the analysis.
740   */
741  private String lsaSummary() {
742    StringBuffer result = new StringBuffer();
743   
744    // print number of latent variables used
745    result.append("Number of latent variables utilized: " + m_actualRank);
746   
747    // print singular values
748    result.append("\n\nSingularValue\tLatentVariable#\n");
749    // create single array of singular values rather than diagonal matrix
750    for (int i = 0; i < m_actualRank; i++) {
751      result.append(Utils.doubleToString(m_s.get(i, i), 9, 5) + "\t" + (i + 1) + "\n");
752    }
753   
754    // print attribute vectors
755    result.append("\nAttribute vectors (left singular vectors) -- row vectors show\n" +
756                  "the relation between the original attributes and the latent \n" +
757                  "variables computed by the singular value decomposition:\n");
758    for (int i = 0; i < m_actualRank; i++) {
759      result.append("LatentVariable#" + (i + 1) + "\t");
760    }
761    result.append("AttributeName\n");
762    for (int i = 0; i < m_u.getRowDimension(); i++) { // for each attribute
763      for (int j = 0; j < m_u.getColumnDimension(); j++) { // for each latent variable
764        result.append(Utils.doubleToString(m_u.get(i, j), 9, 5) + "\t\t");
765      }
766      result.append(m_trainInstances.attribute(i).name() + "\n");
767    }
768   
769    // print instance vectors
770    result.append("\n\nInstance vectors (right singular vectors) -- column\n" +
771                  "vectors show the relation between the original instances and the\n" +
772                  "latent variables computed by the singular value decomposition:\n");
773    for (int i = 0; i < m_numInstances; i++) {
774      result.append("Instance#" + (i + 1) + "\t");
775    }
776    result.append("LatentVariable#\n");
777    for (int i = 0; i < m_v.getColumnDimension(); i++) { // for each instance
778      for (int j = 0; j < m_v.getRowDimension(); j++) { // for each latent variable
779        // going down columns instead of across rows because we're
780        // printing v' but have v stored
781        result.append(Utils.doubleToString(m_v.get(j, i), 9, 5) + "\t");
782      }
783      result.append((i + 1) + "\n");
784    }
785   
786    return result.toString();
787  }
788 
789  /**
790   * Returns the revision string.
791   *
792   * @return            the revision
793   */
794  public String getRevision() {
795    return RevisionUtils.extract("$Revision: 5987 $");
796  }
797 
798  /**
799   * Main method for testing this class
800   * @param argv should contain the command line arguments to the
801   * evaluator/transformer (see AttributeSelection)
802   */
803  public static void main(String [] argv) {
804    runEvaluator(new LatentSemanticAnalysis(), argv);
805  }
806}
Note: See TracBrowser for help on using the repository browser.