Context Navigation

source: src/main/java/weka/attributeSelection/LatentSemanticAnalysis.java @ 5

Last change on this file since 5 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 27.6 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* LatentSemanticAnalysis.java
19	* Copyright (C) 2008 Amri Napolitano
20	*
21	*/
22
23	package weka.attributeSelection;
24
25	import weka.core.Attribute;
26	import weka.core.Capabilities;
27	import weka.core.Check;
28	import weka.core.CheckOptionHandler;
29	import weka.core.FastVector;
30	import weka.core.Instance;
31	import weka.core.DenseInstance;
32	import weka.core.Instances;
33	import weka.core.matrix.Matrix;
34	import weka.core.Option;
35	import weka.core.OptionHandler;
36	import weka.core.RevisionUtils;
37	import weka.core.SparseInstance;
38	import weka.core.Utils;
39	import weka.core.Capabilities.Capability;
40	import weka.core.matrix.SingularValueDecomposition;
41	import weka.filters.Filter;
42	import weka.filters.unsupervised.attribute.NominalToBinary;
43	import weka.filters.unsupervised.attribute.Normalize;
44	import weka.filters.unsupervised.attribute.Remove;
45	import weka.filters.unsupervised.attribute.ReplaceMissingValues;
46
47	import java.io.BufferedReader;
48	import java.io.File;
49	import java.io.FileReader;
50	import java.util.Enumeration;
51	import java.util.Vector;
52
53	/**
54	<!-- globalinfo-start -->
55	* Performs latent semantic analysis and transformation of the data.
56	* Use in conjunction with a Ranker search. A low-rank approximation
57	* of the full data is found by specifying the number of singular values
58	* to use. The dataset may be transformed to give the relation of either
59	* the attributes or the instances (default) to the concept space created
60	* by the transformation.
61	* <p/>
62	<!-- globalinfo-end -->
63	*
64	<!-- options-start -->
65	* Valid options are: <p/>
66	*
67	* <pre> -N
68	* Normalize input data.</pre>
69	*
70	* <pre> -R
71	* Rank approximation used in LSA. May be actual number of
72	* LSA attributes to include (if greater than 1) or a proportion
73	* of total singular values to account for (if between 0 and 1).
74	* A value less than or equal to zero means use all latent variables.
75	* (default = 0.95)</pre>
76	*
77	* <pre> -A
78	* Maximum number of attributes to include in
79	* transformed attribute names. (-1 = include all)</pre>
80	*
81	<!-- options-end -->
82	*
83	* @author Amri Napolitano
84	* @version $Revision: 5987 $
85	*/
86
87	public class LatentSemanticAnalysis
88	extends UnsupervisedAttributeEvaluator
89	implements AttributeTransformer, OptionHandler {
90
91	/** For serialization */
92	static final long serialVersionUID = -8712112988018106198L;
93
94	/** The data to analyse/transform */
95	private Instances m_trainInstances;
96
97	/**
98	* Keep a copy for the class attribute (if set) and for
99	* checking for header compatibility
100	*/
101	private Instances m_trainHeader;
102
103	/** The header for the transformed data format */
104	private Instances m_transformedFormat;
105
106	/** Data has a class set */
107	private boolean m_hasClass;
108
109	/** Class index */
110	private int m_classIndex;
111
112	/** Number of attributes */
113	private int m_numAttributes;
114
115	/** Number of instances */
116	private int m_numInstances;
117
118	/** Is transpose necessary because numAttributes < numInstances? */
119	private boolean m_transpose = false;
120
121	/** Will hold the left singular vectors */
122	private Matrix m_u = null;
123
124	/** Will hold the singular values */
125	private Matrix m_s = null;
126
127	/** Will hold the right singular vectors */
128	private Matrix m_v = null;
129
130	/** Will hold the matrix used to transform instances to the new feature space */
131	private Matrix m_transformationMatrix = null;
132
133	/** Filters for original data */
134	private ReplaceMissingValues m_replaceMissingFilter;
135	private Normalize m_normalizeFilter;
136	private NominalToBinary m_nominalToBinaryFilter;
137	private Remove m_attributeFilter;
138
139	/** The number of attributes in the LSA transformed data */
140	private int m_outputNumAttributes = -1;
141
142	/** Normalize the input data? */
143	private boolean m_normalize = false;
144
145	/** The approximation rank to use (between 0 and 1 means coverage proportion) */
146	private double m_rank = 0.95;
147
148	/** The sum of the squares of the singular values */
149	private double m_sumSquaredSingularValues = 0.0;
150
151	/** The actual rank number to use for computation */
152	private int m_actualRank = -1;
153
154	/** Maximum number of attributes in the transformed attribute name */
155	private int m_maxAttributesInName = 5;
156
157	/**
158	* Returns a string describing this attribute transformer
159	* @return a description of the evaluator suitable for
160	* displaying in the explorer/experimenter gui
161	*/
162	public String globalInfo() {
163	return "Performs latent semantic analysis and transformation of the data. Use in " +
164	"conjunction with a Ranker search. A low-rank approximation of the full data is " +
165	"found by either specifying the number of singular values to use or specifying a " +
166	"proportion of the singular values to cover.";
167	}
168
169	/**
170	* Returns an enumeration describing the available options. <p>
171	*
172	* @return an enumeration of all the available options.
173	**/
174	public Enumeration listOptions () {
175	Vector options = new Vector(4);
176	options.addElement(new Option("\tNormalize input data.", "N", 0, "-N"));
177
178	options.addElement(new Option("\tRank approximation used in LSA. \n" +
179	"\tMay be actual number of LSA attributes \n" +
180	"\tto include (if greater than 1) or a \n" +
181	"\tproportion of total singular values to \n" +
182	"\taccount for (if between 0 and 1). \n" +
183	"\tA value less than or equal to zero means \n" +
184	"\tuse all latent variables.(default = 0.95)",
185	"R",1,"-R"));
186
187	options.addElement(new Option("\tMaximum number of attributes to include\n" +
188	"\tin transformed attribute names.\n" +
189	"\t(-1 = include all)"
190	, "A", 1, "-A"));
191	return options.elements();
192	}
193
194	/**
195	* Parses a given list of options. <p/>
196	*
197	<!-- options-start -->
198	* Valid options are: <p/>
199	*
200	* <pre> -N
201	* Normalize input data.</pre>
202	*
203	* <pre> -R
204	* Rank approximation used in LSA. May be actual number of
205	* LSA attributes to include (if greater than 1) or a proportion
206	* of total singular values to account for (if between 0 and 1).
207	* A value less than or equal to zero means use all latent variables.
208	* (default = 0.95)</pre>
209	*
210	* <pre> -A
211	* Maximum number of attributes to include in
212	* transformed attribute names. (-1 = include all)</pre>
213	*
214	<!-- options-end -->
215	*
216	* @param options the list of options as an array of strings
217	* @throws Exception if an option is not supported
218	*/
219	public void setOptions (String[] options)
220	throws Exception {
221	resetOptions();
222	String optionString;
223
224	//set approximation rank
225	optionString = Utils.getOption('R', options);
226	if (optionString.length() != 0) {
227	double temp;
228	temp = Double.valueOf(optionString).doubleValue();
229	setRank(temp);
230	}
231
232	//set number of attributes to use in transformed names
233	optionString = Utils.getOption('A', options);
234	if (optionString.length() != 0) {
235	setMaximumAttributeNames(Integer.parseInt(optionString));
236	}
237
238	//set normalize option
239	setNormalize(Utils.getFlag('N', options));
240	}
241
242	/**
243	* Reset to defaults
244	*/
245	private void resetOptions() {
246	m_rank = 0.95;
247	m_normalize = true;
248	m_maxAttributesInName = 5;
249	}
250
251	/**
252	* Returns the tip text for this property
253	* @return tip text for this property suitable for
254	* displaying in the explorer/experimenter gui
255	*/
256	public String normalizeTipText() {
257	return "Normalize input data.";
258	}
259
260	/**
261	* Set whether input data will be normalized.
262	* @param newNormalize true if input data is to be normalized
263	*/
264	public void setNormalize(boolean newNormalize) {
265	m_normalize = newNormalize;
266	}
267
268	/**
269	* Gets whether or not input data is to be normalized
270	* @return true if input data is to be normalized
271	*/
272	public boolean getNormalize() {
273	return m_normalize;
274	}
275
276	/**
277	* Returns the tip text for this property
278	* @return tip text for this property suitable for
279	* displaying in the explorer/experimenter gui
280	*/
281	public String rankTipText() {
282	return "Matrix rank to use for data reduction. Can be a" +
283	" proportion to indicate desired coverage";
284	}
285
286	/**
287	* Sets the desired matrix rank (or coverage proportion) for feature-space reduction
288	* @param newRank the desired rank (or coverage) for feature-space reduction
289	*/
290	public void setRank(double newRank) {
291	m_rank = newRank;
292	}
293
294	/**
295	* Gets the desired matrix rank (or coverage proportion) for feature-space reduction
296	* @return the rank (or coverage) for feature-space reduction
297	*/
298	public double getRank() {
299	return m_rank;
300	}
301
302	/**
303	* Returns the tip text for this property
304	* @return tip text for this property suitable for
305	* displaying in the explorer/experimenter gui
306	*/
307	public String maximumAttributeNamesTipText() {
308	return "The maximum number of attributes to include in transformed attribute names.";
309	}
310
311	/**
312	* Sets maximum number of attributes to include in
313	* transformed attribute names.
314	* @param newMaxAttributes the maximum number of attributes
315	*/
316	public void setMaximumAttributeNames(int newMaxAttributes) {
317	m_maxAttributesInName = newMaxAttributes;
318	}
319
320	/**
321	* Gets maximum number of attributes to include in
322	* transformed attribute names.
323	* @return the maximum number of attributes
324	*/
325	public int getMaximumAttributeNames() {
326	return m_maxAttributesInName;
327	}
328
329	/**
330	* Gets the current settings of LatentSemanticAnalysis
331	*
332	* @return an array of strings suitable for passing to setOptions()
333	*/
334	public String[] getOptions () {
335
336	String[] options = new String[5];
337	int current = 0;
338
339	if (getNormalize()) {
340	options[current++] = "-N";
341	}
342
343	options[current++] = "-R";
344	options[current++] = "" + getRank();
345
346	options[current++] = "-A";
347	options[current++] = "" + getMaximumAttributeNames();
348
349	while (current < options.length) {
350	options[current++] = "";
351	}
352
353	return options;
354	}
355
356	/**
357	* Returns the capabilities of this evaluator.
358	*
359	* @return the capabilities of this evaluator
360	* @see Capabilities
361	*/
362	public Capabilities getCapabilities() {
363	Capabilities result = super.getCapabilities();
364	result.disableAll();
365
366	// attributes
367	result.enable(Capability.NOMINAL_ATTRIBUTES);
368	result.enable(Capability.NUMERIC_ATTRIBUTES);
369	result.enable(Capability.DATE_ATTRIBUTES);
370	result.enable(Capability.MISSING_VALUES);
371
372	// class
373	result.enable(Capability.NOMINAL_CLASS);
374	result.enable(Capability.NUMERIC_CLASS);
375	result.enable(Capability.DATE_CLASS);
376	result.enable(Capability.MISSING_CLASS_VALUES);
377	result.enable(Capability.NO_CLASS);
378
379	return result;
380	}
381
382	/**
383	* Initializes the singular values/vectors and performs the analysis
384	* @param data the instances to analyse/transform
385	* @throws Exception if analysis fails
386	*/
387	public void buildEvaluator(Instances data) throws Exception {
388	// can evaluator handle data?
389	getCapabilities().testWithFail(data);
390
391	buildAttributeConstructor(data);
392	}
393
394	/**
395	* Initializes the singular values/vectors and performs the analysis
396	* @param data the instances to analyse/transform
397	* @throws Exception if analysis fails
398	*/
399	private void buildAttributeConstructor (Instances data) throws Exception {
400	// initialize attributes for performing analysis
401	m_transpose = false;
402	m_s = null;
403	m_u = null;
404	m_v = null;
405	m_outputNumAttributes = -1;
406	m_actualRank = -1;
407	m_sumSquaredSingularValues = 0.0;
408
409	m_trainInstances = new Instances(data);
410	m_trainHeader = null;
411
412	m_attributeFilter = null;
413	m_nominalToBinaryFilter = null;
414
415	m_replaceMissingFilter = new ReplaceMissingValues();
416	m_replaceMissingFilter.setInputFormat(m_trainInstances);
417	m_trainInstances = Filter.useFilter(m_trainInstances, m_replaceMissingFilter);
418
419	// vector to hold indices of attributes to delete (class attribute,
420	// attributes that are all missing, or attributes with one distinct value)
421	Vector attributesToRemove = new Vector();
422
423	// if data has a class attribute
424	if (m_trainInstances.classIndex() >= 0) {
425
426	m_hasClass = true;
427	m_classIndex = m_trainInstances.classIndex();
428
429	// set class attribute to be removed
430	attributesToRemove.addElement(new Integer(m_classIndex));
431	}
432	// make copy of training data so the class values (if set) can be appended to final
433	// transformed instances and so that we can check header compatibility
434	m_trainHeader = new Instances(m_trainInstances, 0);
435
436	// normalize data if desired
437	if (m_normalize) {
438	m_normalizeFilter = new Normalize();
439	m_normalizeFilter.setInputFormat(m_trainInstances);
440	m_trainInstances = Filter.useFilter(m_trainInstances, m_normalizeFilter);
441	}
442
443	// convert any nominal attributes to binary numeric attributes
444	m_nominalToBinaryFilter = new NominalToBinary();
445	m_nominalToBinaryFilter.setInputFormat(m_trainInstances);
446	m_trainInstances = Filter.useFilter(m_trainInstances, m_nominalToBinaryFilter);
447
448	// delete any attributes with only one distinct value or are all missing
449	for (int i = 0; i < m_trainInstances.numAttributes(); i++) {
450	if (m_trainInstances.numDistinctValues(i) <= 1) {
451	attributesToRemove.addElement(new Integer(i));
452	}
453	}
454
455	// remove columns from the data if necessary
456	if (attributesToRemove.size() > 0) {
457	m_attributeFilter = new Remove();
458	int [] todelete = new int[attributesToRemove.size()];
459	for (int i = 0; i < attributesToRemove.size(); i++) {
460	todelete[i] = ((Integer)(attributesToRemove.elementAt(i))).intValue();
461	}
462	m_attributeFilter.setAttributeIndicesArray(todelete);
463	m_attributeFilter.setInvertSelection(false);
464	m_attributeFilter.setInputFormat(m_trainInstances);
465	m_trainInstances = Filter.useFilter(m_trainInstances, m_attributeFilter);
466	}
467
468	// can evaluator handle the processed data ? e.g., enough attributes?
469	getCapabilities().testWithFail(m_trainInstances);
470
471	// record properties of final, ready-to-process data
472	m_numInstances = m_trainInstances.numInstances();
473	m_numAttributes = m_trainInstances.numAttributes();
474
475	// create matrix of attribute values and compute singular value decomposition
476	double [][] trainValues = new double[m_numAttributes][m_numInstances];
477	for (int i = 0; i < m_numAttributes; i++) {
478	trainValues[i] = m_trainInstances.attributeToDoubleArray(i);
479	}
480	Matrix trainMatrix = new Matrix(trainValues);
481	// svd requires rows >= columns, so transpose data if necessary
482	if (m_numAttributes < m_numInstances) {
483	m_transpose = true;
484	trainMatrix = trainMatrix.transpose();
485	}
486	SingularValueDecomposition trainSVD = trainMatrix.svd();
487	m_u = trainSVD.getU(); // left singular vectors
488	m_s = trainSVD.getS(); // singular values
489	m_v = trainSVD.getV(); // right singular vectors
490
491	// find actual rank to use
492	int maxSingularValues = trainSVD.rank();
493	for (int i = 0; i < m_s.getRowDimension(); i++) {
494	m_sumSquaredSingularValues += m_s.get(i, i) * m_s.get(i, i);
495	}
496	if (maxSingularValues == 0) { // no nonzero singular values (shouldn't happen)
497	// reset values from computation
498	m_s = null;
499	m_u = null;
500	m_v = null;
501	m_sumSquaredSingularValues = 0.0;
502
503	throw new Exception("SVD computation produced no non-zero singular values.");
504	}
505	if (m_rank > maxSingularValues \|\| m_rank <= 0) { // adjust rank if too high or too low
506	m_actualRank = maxSingularValues;
507	} else if (m_rank < 1.0) { // determine how many singular values to include for desired coverage
508	double currentSumOfSquaredSingularValues = 0.0;
509	for (int i = 0; i < m_s.getRowDimension() && m_actualRank == -1; i++) {
510	currentSumOfSquaredSingularValues += m_s.get(i, i) * m_s.get(i, i);
511	if (currentSumOfSquaredSingularValues / m_sumSquaredSingularValues >= m_rank) {
512	m_actualRank = i + 1;
513	}
514	}
515	} else {
516	m_actualRank = (int) m_rank;
517	}
518
519	// lower matrix ranks, adjust for transposition (if necessary), and
520	// compute matrix for transforming future instances
521	if (m_transpose) {
522	Matrix tempMatrix = m_u;
523	m_u = m_v;
524	m_v = tempMatrix;
525	}
526	m_u = m_u.getMatrix(0, m_u.getRowDimension() - 1, 0, m_actualRank - 1);
527	m_s = m_s.getMatrix(0, m_actualRank - 1, 0, m_actualRank - 1);
528	m_v = m_v.getMatrix(0, m_v.getRowDimension() - 1, 0, m_actualRank - 1);
529	m_transformationMatrix = m_u.times(m_s.inverse());
530
531	//create dataset header for transformed instances
532	m_transformedFormat = setOutputFormat();
533	}
534
535	/**
536	* Set the format for the transformed data
537	* @return a set of empty Instances (header only) in the new format
538	*/
539	private Instances setOutputFormat() {
540	// if analysis hasn't been performed (successfully) yet
541	if (m_s == null) {
542	return null;
543	}
544
545	// set up transformed attributes
546	if (m_hasClass) {
547	m_outputNumAttributes = m_actualRank + 1;
548	} else {
549	m_outputNumAttributes = m_actualRank;
550	}
551	int numAttributesInName = m_maxAttributesInName;
552	if (numAttributesInName <= 0 \|\| numAttributesInName >= m_numAttributes) {
553	numAttributesInName = m_numAttributes;
554	}
555	FastVector attributes = new FastVector(m_outputNumAttributes);
556	for (int i = 0; i < m_actualRank; i++) {
557	// create attribute name
558	String attributeName = "";
559	double [] attributeCoefficients =
560	m_transformationMatrix.getMatrix(0, m_numAttributes - 1, i, i).getColumnPackedCopy();
561	for (int j = 0; j < numAttributesInName; j++) {
562	if (j > 0) {
563	attributeName += "+";
564	}
565	attributeName += Utils.doubleToString(attributeCoefficients[j], 5, 3);
566	attributeName += m_trainInstances.attribute(j).name();
567	}
568	if (numAttributesInName < m_numAttributes) {
569	attributeName += "...";
570	}
571	// add attribute
572	attributes.addElement(new Attribute(attributeName));
573	}
574	// add original class attribute if present
575	if (m_hasClass) {
576	attributes.addElement(m_trainHeader.classAttribute().copy());
577	}
578	// create blank header
579	Instances outputFormat = new Instances(m_trainInstances.relationName() + "_LSA",
580	attributes, 0);
581	m_outputNumAttributes = outputFormat.numAttributes();
582	// set class attribute if applicable
583	if (m_hasClass) {
584	outputFormat.setClassIndex(m_outputNumAttributes - 1);
585	}
586
587	return outputFormat;
588	}
589
590	/**
591	* Returns just the header for the transformed data (ie. an empty
592	* set of instances. This is so that AttributeSelection can
593	* determine the structure of the transformed data without actually
594	* having to get all the transformed data through getTransformedData().
595	* @return the header of the transformed data.
596	* @throws Exception if the header of the transformed data can't
597	* be determined.
598	*/
599	public Instances transformedHeader() throws Exception {
600	if (m_s == null) {
601	throw new Exception("Latent Semantic Analysis hasn't been successfully performed.");
602	}
603	return m_transformedFormat;
604	}
605
606	/**
607	* Transform the supplied data set (assumed to be the same format
608	* as the training data)
609	* @return the transformed training data
610	* @throws Exception if transformed data can't be returned
611	*/
612	public Instances transformedData(Instances data) throws Exception {
613	if (m_s == null) {
614	throw new Exception("Latent Semantic Analysis hasn't been built yet");
615	}
616
617	Instances output = new Instances(m_transformedFormat, m_numInstances);
618
619	// the transformed version of instance i from the training data
620	// is stored as the i'th row vector in v (the right singular vectors)
621	for (int i = 0; i < data.numInstances(); i++) {
622	Instance currentInstance = data.instance(i);
623	// record attribute values for converted instance
624	double [] newValues = new double[m_outputNumAttributes];
625	for (int j = 0; j < m_actualRank; j++) { // fill in values from v
626	newValues[j] = m_v.get(i, j);
627	}
628	if (m_hasClass) { // copy class value if applicable
629	newValues[m_outputNumAttributes - 1] = currentInstance.classValue();
630	}
631	//create new instance with recorded values and add to output dataset
632	Instance newInstance;
633	if (currentInstance instanceof SparseInstance) {
634	newInstance = new SparseInstance(currentInstance.weight(), newValues);
635	} else {
636	newInstance = new DenseInstance(currentInstance.weight(), newValues);
637	}
638	output.add(newInstance);
639	}
640
641	return output;
642	}
643
644	/**
645	* Evaluates the merit of a transformed attribute. This is defined
646	* to be the square of the singular value for the latent variable
647	* corresponding to the transformed attribute.
648	* @param att the attribute to be evaluated
649	* @return the merit of a transformed attribute
650	* @throws Exception if attribute can't be evaluated
651	*/
652	public double evaluateAttribute(int att) throws Exception {
653	if (m_s == null) {
654	throw new Exception("Latent Semantic Analysis hasn't been successfully" +
655	" performed yet!");
656	}
657
658	//return the square of the corresponding singular value
659	return (m_s.get(att, att) * m_s.get(att, att)) / m_sumSquaredSingularValues;
660	}
661
662	/**
663	* Transform an instance in original (unnormalized) format
664	* @param instance an instance in the original (unnormalized) format
665	* @return a transformed instance
666	* @throws Exception if instance can't be transformed
667	*/
668	public Instance convertInstance(Instance instance) throws Exception {
669	if (m_s == null) {
670	throw new Exception("convertInstance: Latent Semantic Analysis not " +
671	"performed yet.");
672	}
673
674	// array to hold new attribute values
675	double [] newValues = new double[m_outputNumAttributes];
676
677	// apply filters so new instance is in same format as training instances
678	Instance tempInstance = (Instance)instance.copy();
679	if (!instance.dataset().equalHeaders(m_trainHeader)) {
680	throw new Exception("Can't convert instance: headers don't match: " +
681	"LatentSemanticAnalysis\n" + instance.dataset().equalHeadersMsg(m_trainHeader));
682	}
683	// replace missing values
684	m_replaceMissingFilter.input(tempInstance);
685	m_replaceMissingFilter.batchFinished();
686	tempInstance = m_replaceMissingFilter.output();
687	// normalize
688	if (m_normalize) {
689	m_normalizeFilter.input(tempInstance);
690	m_normalizeFilter.batchFinished();
691	tempInstance = m_normalizeFilter.output();
692	}
693	// convert nominal attributes to binary
694	m_nominalToBinaryFilter.input(tempInstance);
695	m_nominalToBinaryFilter.batchFinished();
696	tempInstance = m_nominalToBinaryFilter.output();
697	// remove class/other attributes
698	if (m_attributeFilter != null) {
699	m_attributeFilter.input(tempInstance);
700	m_attributeFilter.batchFinished();
701	tempInstance = m_attributeFilter.output();
702	}
703
704	// record new attribute values
705	if (m_hasClass) { // copy class value
706	newValues[m_outputNumAttributes - 1] = instance.classValue();
707	}
708	double [][] oldInstanceValues = new double[1][m_numAttributes];
709	oldInstanceValues[0] = tempInstance.toDoubleArray();
710	Matrix instanceVector = new Matrix(oldInstanceValues); // old attribute values
711	instanceVector = instanceVector.times(m_transformationMatrix); // new attribute values
712	for (int i = 0; i < m_actualRank; i++) {
713	newValues[i] = instanceVector.get(0, i);
714	}
715
716	// return newly transformed instance
717	if (instance instanceof SparseInstance) {
718	return new SparseInstance(instance.weight(), newValues);
719	} else {
720	return new DenseInstance(instance.weight(), newValues);
721	}
722	}
723
724	/**
725	* Returns a description of this attribute transformer
726	* @return a String describing this attribute transformer
727	*/
728	public String toString() {
729	if (m_s == null) {
730	return "Latent Semantic Analysis hasn't been built yet!";
731	} else {
732	return "\tLatent Semantic Analysis Attribute Transformer\n\n"
733	+ lsaSummary();
734	}
735	}
736
737	/**
738	* Return a summary of the analysis
739	* @return a summary of the analysis.
740	*/
741	private String lsaSummary() {
742	StringBuffer result = new StringBuffer();
743
744	// print number of latent variables used
745	result.append("Number of latent variables utilized: " + m_actualRank);
746
747	// print singular values
748	result.append("\n\nSingularValue\tLatentVariable#\n");
749	// create single array of singular values rather than diagonal matrix
750	for (int i = 0; i < m_actualRank; i++) {
751	result.append(Utils.doubleToString(m_s.get(i, i), 9, 5) + "\t" + (i + 1) + "\n");
752	}
753
754	// print attribute vectors
755	result.append("\nAttribute vectors (left singular vectors) -- row vectors show\n" +
756	"the relation between the original attributes and the latent \n" +
757	"variables computed by the singular value decomposition:\n");
758	for (int i = 0; i < m_actualRank; i++) {
759	result.append("LatentVariable#" + (i + 1) + "\t");
760	}
761	result.append("AttributeName\n");
762	for (int i = 0; i < m_u.getRowDimension(); i++) { // for each attribute
763	for (int j = 0; j < m_u.getColumnDimension(); j++) { // for each latent variable
764	result.append(Utils.doubleToString(m_u.get(i, j), 9, 5) + "\t\t");
765	}
766	result.append(m_trainInstances.attribute(i).name() + "\n");
767	}
768
769	// print instance vectors
770	result.append("\n\nInstance vectors (right singular vectors) -- column\n" +
771	"vectors show the relation between the original instances and the\n" +
772	"latent variables computed by the singular value decomposition:\n");
773	for (int i = 0; i < m_numInstances; i++) {
774	result.append("Instance#" + (i + 1) + "\t");
775	}
776	result.append("LatentVariable#\n");
777	for (int i = 0; i < m_v.getColumnDimension(); i++) { // for each instance
778	for (int j = 0; j < m_v.getRowDimension(); j++) { // for each latent variable
779	// going down columns instead of across rows because we're
780	// printing v' but have v stored
781	result.append(Utils.doubleToString(m_v.get(j, i), 9, 5) + "\t");
782	}
783	result.append((i + 1) + "\n");
784	}
785
786	return result.toString();
787	}
788
789	/**
790	* Returns the revision string.
791	*
792	* @return the revision
793	*/
794	public String getRevision() {
795	return RevisionUtils.extract("$Revision: 5987 $");
796	}
797
798	/**
799	* Main method for testing this class
800	* @param argv should contain the command line arguments to the
801	* evaluator/transformer (see AttributeSelection)
802	*/
803	public static void main(String [] argv) {
804	runEvaluator(new LatentSemanticAnalysis(), argv);
805	}
806	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: