[4] | 1 | /* |
---|
| 2 | * This program is free software; you can redistribute it and/or modify |
---|
| 3 | * it under the terms of the GNU General Public License as published by |
---|
| 4 | * the Free Software Foundation; either version 2 of the License, or |
---|
| 5 | * (at your option) any later version. |
---|
| 6 | * |
---|
| 7 | * This program is distributed in the hope that it will be useful, |
---|
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 10 | * GNU General Public License for more details. |
---|
| 11 | * |
---|
| 12 | * You should have received a copy of the GNU General Public License |
---|
| 13 | * along with this program; if not, write to the Free Software |
---|
| 14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
| 15 | */ |
---|
| 16 | |
---|
| 17 | /* |
---|
| 18 | * LatentSemanticAnalysis.java |
---|
| 19 | * Copyright (C) 2008 Amri Napolitano |
---|
| 20 | * |
---|
| 21 | */ |
---|
| 22 | |
---|
| 23 | package weka.attributeSelection; |
---|
| 24 | |
---|
| 25 | import weka.core.Attribute; |
---|
| 26 | import weka.core.Capabilities; |
---|
| 27 | import weka.core.Check; |
---|
| 28 | import weka.core.CheckOptionHandler; |
---|
| 29 | import weka.core.FastVector; |
---|
| 30 | import weka.core.Instance; |
---|
| 31 | import weka.core.DenseInstance; |
---|
| 32 | import weka.core.Instances; |
---|
| 33 | import weka.core.matrix.Matrix; |
---|
| 34 | import weka.core.Option; |
---|
| 35 | import weka.core.OptionHandler; |
---|
| 36 | import weka.core.RevisionUtils; |
---|
| 37 | import weka.core.SparseInstance; |
---|
| 38 | import weka.core.Utils; |
---|
| 39 | import weka.core.Capabilities.Capability; |
---|
| 40 | import weka.core.matrix.SingularValueDecomposition; |
---|
| 41 | import weka.filters.Filter; |
---|
| 42 | import weka.filters.unsupervised.attribute.NominalToBinary; |
---|
| 43 | import weka.filters.unsupervised.attribute.Normalize; |
---|
| 44 | import weka.filters.unsupervised.attribute.Remove; |
---|
| 45 | import weka.filters.unsupervised.attribute.ReplaceMissingValues; |
---|
| 46 | |
---|
| 47 | import java.io.BufferedReader; |
---|
| 48 | import java.io.File; |
---|
| 49 | import java.io.FileReader; |
---|
| 50 | import java.util.Enumeration; |
---|
| 51 | import java.util.Vector; |
---|
| 52 | |
---|
| 53 | /** |
---|
| 54 | <!-- globalinfo-start --> |
---|
| 55 | * Performs latent semantic analysis and transformation of the data. |
---|
| 56 | * Use in conjunction with a Ranker search. A low-rank approximation |
---|
| 57 | * of the full data is found by specifying the number of singular values |
---|
| 58 | * to use. The dataset may be transformed to give the relation of either |
---|
| 59 | * the attributes or the instances (default) to the concept space created |
---|
| 60 | * by the transformation. |
---|
| 61 | * <p/> |
---|
| 62 | <!-- globalinfo-end --> |
---|
| 63 | * |
---|
| 64 | <!-- options-start --> |
---|
| 65 | * Valid options are: <p/> |
---|
| 66 | * |
---|
| 67 | * <pre> -N |
---|
| 68 | * Normalize input data.</pre> |
---|
| 69 | * |
---|
| 70 | * <pre> -R |
---|
| 71 | * Rank approximation used in LSA. May be actual number of |
---|
| 72 | * LSA attributes to include (if greater than 1) or a proportion |
---|
| 73 | * of total singular values to account for (if between 0 and 1). |
---|
| 74 | * A value less than or equal to zero means use all latent variables. |
---|
| 75 | * (default = 0.95)</pre> |
---|
| 76 | * |
---|
| 77 | * <pre> -A |
---|
| 78 | * Maximum number of attributes to include in |
---|
| 79 | * transformed attribute names. (-1 = include all)</pre> |
---|
| 80 | * |
---|
| 81 | <!-- options-end --> |
---|
| 82 | * |
---|
| 83 | * @author Amri Napolitano |
---|
| 84 | * @version $Revision: 5987 $ |
---|
| 85 | */ |
---|
| 86 | |
---|
| 87 | public class LatentSemanticAnalysis |
---|
| 88 | extends UnsupervisedAttributeEvaluator |
---|
| 89 | implements AttributeTransformer, OptionHandler { |
---|
| 90 | |
---|
| 91 | /** For serialization */ |
---|
| 92 | static final long serialVersionUID = -8712112988018106198L; |
---|
| 93 | |
---|
| 94 | /** The data to transform analyse/transform */ |
---|
| 95 | private Instances m_trainInstances; |
---|
| 96 | |
---|
| 97 | /** |
---|
| 98 | * Keep a copy for the class attribute (if set) and for |
---|
| 99 | * checking for header compatibility |
---|
| 100 | */ |
---|
| 101 | private Instances m_trainHeader; |
---|
| 102 | |
---|
| 103 | /** The header for the transformed data format */ |
---|
| 104 | private Instances m_transformedFormat; |
---|
| 105 | |
---|
| 106 | /** Data has a class set */ |
---|
| 107 | private boolean m_hasClass; |
---|
| 108 | |
---|
| 109 | /** Class index */ |
---|
| 110 | private int m_classIndex; |
---|
| 111 | |
---|
| 112 | /** Number of attributes */ |
---|
| 113 | private int m_numAttributes; |
---|
| 114 | |
---|
| 115 | /** Number of instances */ |
---|
| 116 | private int m_numInstances; |
---|
| 117 | |
---|
| 118 | /** Is transpose necessary because numAttributes < numInstances? */ |
---|
| 119 | private boolean m_transpose = false; |
---|
| 120 | |
---|
| 121 | /** Will hold the left singular vectors */ |
---|
| 122 | private Matrix m_u = null; |
---|
| 123 | |
---|
| 124 | /** Will hold the singular values */ |
---|
| 125 | private Matrix m_s = null; |
---|
| 126 | |
---|
| 127 | /** Will hold the right singular values */ |
---|
| 128 | private Matrix m_v = null; |
---|
| 129 | |
---|
| 130 | /** Will hold the matrix used to transform instances to the new feature space */ |
---|
| 131 | private Matrix m_transformationMatrix = null; |
---|
| 132 | |
---|
| 133 | /** Filters for original data */ |
---|
| 134 | private ReplaceMissingValues m_replaceMissingFilter; |
---|
| 135 | private Normalize m_normalizeFilter; |
---|
| 136 | private NominalToBinary m_nominalToBinaryFilter; |
---|
| 137 | private Remove m_attributeFilter; |
---|
| 138 | |
---|
| 139 | /** The number of attributes in the LSA transformed data */ |
---|
| 140 | private int m_outputNumAttributes = -1; |
---|
| 141 | |
---|
| 142 | /** Normalize the input data? */ |
---|
| 143 | private boolean m_normalize = false; |
---|
| 144 | |
---|
| 145 | /** The approximation rank to use (between 0 and 1 means coverage proportion) */ |
---|
| 146 | private double m_rank = 0.95; |
---|
| 147 | |
---|
| 148 | /** The sum of the squares of the singular values */ |
---|
| 149 | private double m_sumSquaredSingularValues = 0.0; |
---|
| 150 | |
---|
| 151 | /** The actual rank number to use for computation */ |
---|
| 152 | private int m_actualRank = -1; |
---|
| 153 | |
---|
| 154 | /** Maximum number of attributes in the transformed attribute name */ |
---|
| 155 | private int m_maxAttributesInName = 5; |
---|
| 156 | |
---|
| 157 | /** |
---|
| 158 | * Returns a string describing this attribute transformer |
---|
| 159 | * @return a description of the evaluator suitable for |
---|
| 160 | * displaying in the explorer/experimenter gui |
---|
| 161 | */ |
---|
| 162 | public String globalInfo() { |
---|
| 163 | return "Performs latent semantic analysis and transformation of the data. Use in " + |
---|
| 164 | "conjunction with a Ranker search. A low-rank approximation of the full data is " + |
---|
| 165 | "found by either specifying the number of singular values to use or specifying a " + |
---|
| 166 | "proportion of the singular values to cover."; |
---|
| 167 | } |
---|
| 168 | |
---|
| 169 | /** |
---|
| 170 | * Returns an enumeration describing the available options. <p> |
---|
| 171 | * |
---|
| 172 | * @return an enumeration of all the available options. |
---|
| 173 | **/ |
---|
| 174 | public Enumeration listOptions () { |
---|
| 175 | Vector options = new Vector(4); |
---|
| 176 | options.addElement(new Option("\tNormalize input data.", "N", 0, "-N")); |
---|
| 177 | |
---|
| 178 | options.addElement(new Option("\tRank approximation used in LSA. \n" + |
---|
| 179 | "\tMay be actual number of LSA attributes \n" + |
---|
| 180 | "\tto include (if greater than 1) or a \n" + |
---|
| 181 | "\tproportion of total singular values to \n" + |
---|
| 182 | "\taccount for (if between 0 and 1). \n" + |
---|
| 183 | "\tA value less than or equal to zero means \n" + |
---|
| 184 | "\tuse all latent variables.(default = 0.95)", |
---|
| 185 | "R",1,"-R")); |
---|
| 186 | |
---|
| 187 | options.addElement(new Option("\tMaximum number of attributes to include\n" + |
---|
| 188 | "\tin transformed attribute names.\n" + |
---|
| 189 | "\t(-1 = include all)" |
---|
| 190 | , "A", 1, "-A")); |
---|
| 191 | return options.elements(); |
---|
| 192 | } |
---|
| 193 | |
---|
| 194 | /** |
---|
| 195 | * Parses a given list of options. <p/> |
---|
| 196 | * |
---|
| 197 | <!-- options-start --> |
---|
| 198 | * Valid options are: <p/> |
---|
| 199 | * |
---|
| 200 | * <pre> -N |
---|
| 201 | * Normalize input data.</pre> |
---|
| 202 | * |
---|
| 203 | * <pre> -R |
---|
| 204 | * Rank approximation used in LSA. May be actual number of |
---|
| 205 | * LSA attributes to include (if greater than 1) or a proportion |
---|
| 206 | * of total singular values to account for (if between 0 and 1). |
---|
| 207 | * A value less than or equal to zero means use all latent variables. |
---|
| 208 | * (default = 0.95)</pre> |
---|
| 209 | * |
---|
| 210 | * <pre> -A |
---|
| 211 | * Maximum number of attributes to include in |
---|
| 212 | * transformed attribute names. (-1 = include all)</pre> |
---|
| 213 | * |
---|
| 214 | <!-- options-end --> |
---|
| 215 | * |
---|
| 216 | * @param options the list of options as an array of strings |
---|
| 217 | * @throws Exception if an option is not supported |
---|
| 218 | */ |
---|
| 219 | public void setOptions (String[] options) |
---|
| 220 | throws Exception { |
---|
| 221 | resetOptions(); |
---|
| 222 | String optionString; |
---|
| 223 | |
---|
| 224 | //set approximation rank |
---|
| 225 | optionString = Utils.getOption('R', options); |
---|
| 226 | if (optionString.length() != 0) { |
---|
| 227 | double temp; |
---|
| 228 | temp = Double.valueOf(optionString).doubleValue(); |
---|
| 229 | setRank(temp); |
---|
| 230 | } |
---|
| 231 | |
---|
| 232 | //set number of attributes to use in transformed names |
---|
| 233 | optionString = Utils.getOption('A', options); |
---|
| 234 | if (optionString.length() != 0) { |
---|
| 235 | setMaximumAttributeNames(Integer.parseInt(optionString)); |
---|
| 236 | } |
---|
| 237 | |
---|
| 238 | //set normalize option |
---|
| 239 | setNormalize(Utils.getFlag('N', options)); |
---|
| 240 | } |
---|
| 241 | |
---|
  /**
   * Reset to defaults
   */
  private void resetOptions() {
    m_rank = 0.95;
    // NOTE(review): the field initializer declares m_normalize = false, but the
    // reset value here is true. setOptions() always overwrites this via the -N
    // flag immediately afterwards, so only direct callers of resetOptions()
    // observe the difference -- confirm which default is actually intended.
    m_normalize = true;
    m_maxAttributesInName = 5;
  }
---|
| 250 | |
---|
  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String normalizeTipText() {
    return "Normalize input data.";
  }
---|
| 259 | |
---|
| 260 | /** |
---|
| 261 | * Set whether input data will be normalized. |
---|
| 262 | * @param newNormalize true if input data is to be normalized |
---|
| 263 | */ |
---|
| 264 | public void setNormalize(boolean newNormalize) { |
---|
| 265 | m_normalize = newNormalize; |
---|
| 266 | } |
---|
| 267 | |
---|
| 268 | /** |
---|
| 269 | * Gets whether or not input data is to be normalized |
---|
| 270 | * @return true if input data is to be normalized |
---|
| 271 | */ |
---|
| 272 | public boolean getNormalize() { |
---|
| 273 | return m_normalize; |
---|
| 274 | } |
---|
| 275 | |
---|
| 276 | /** |
---|
| 277 | * Returns the tip text for this property |
---|
| 278 | * @return tip text for this property suitable for |
---|
| 279 | * displaying in the explorer/experimenter gui |
---|
| 280 | */ |
---|
| 281 | public String rankTipText() { |
---|
| 282 | return "Matrix rank to use for data reduction. Can be a" + |
---|
| 283 | " proportion to indicate desired coverage"; |
---|
| 284 | } |
---|
| 285 | |
---|
| 286 | /** |
---|
| 287 | * Sets the desired matrix rank (or coverage proportion) for feature-space reduction |
---|
| 288 | * @param newRank the desired rank (or coverage) for feature-space reduction |
---|
| 289 | */ |
---|
| 290 | public void setRank(double newRank) { |
---|
| 291 | m_rank = newRank; |
---|
| 292 | } |
---|
| 293 | |
---|
| 294 | /** |
---|
| 295 | * Gets the desired matrix rank (or coverage proportion) for feature-space reduction |
---|
| 296 | * @return the rank (or coverage) for feature-space reduction |
---|
| 297 | */ |
---|
| 298 | public double getRank() { |
---|
| 299 | return m_rank; |
---|
| 300 | } |
---|
| 301 | |
---|
  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String maximumAttributeNamesTipText() {
    return "The maximum number of attributes to include in transformed attribute names.";
  }
---|
| 310 | |
---|
| 311 | /** |
---|
| 312 | * Sets maximum number of attributes to include in |
---|
| 313 | * transformed attribute names. |
---|
| 314 | * @param newMaxAttributes the maximum number of attributes |
---|
| 315 | */ |
---|
| 316 | public void setMaximumAttributeNames(int newMaxAttributes) { |
---|
| 317 | m_maxAttributesInName = newMaxAttributes; |
---|
| 318 | } |
---|
| 319 | |
---|
| 320 | /** |
---|
| 321 | * Gets maximum number of attributes to include in |
---|
| 322 | * transformed attribute names. |
---|
| 323 | * @return the maximum number of attributes |
---|
| 324 | */ |
---|
| 325 | public int getMaximumAttributeNames() { |
---|
| 326 | return m_maxAttributesInName; |
---|
| 327 | } |
---|
| 328 | |
---|
| 329 | /** |
---|
| 330 | * Gets the current settings of LatentSemanticAnalysis |
---|
| 331 | * |
---|
| 332 | * @return an array of strings suitable for passing to setOptions() |
---|
| 333 | */ |
---|
| 334 | public String[] getOptions () { |
---|
| 335 | |
---|
| 336 | String[] options = new String[5]; |
---|
| 337 | int current = 0; |
---|
| 338 | |
---|
| 339 | if (getNormalize()) { |
---|
| 340 | options[current++] = "-N"; |
---|
| 341 | } |
---|
| 342 | |
---|
| 343 | options[current++] = "-R"; |
---|
| 344 | options[current++] = "" + getRank(); |
---|
| 345 | |
---|
| 346 | options[current++] = "-A"; |
---|
| 347 | options[current++] = "" + getMaximumAttributeNames(); |
---|
| 348 | |
---|
| 349 | while (current < options.length) { |
---|
| 350 | options[current++] = ""; |
---|
| 351 | } |
---|
| 352 | |
---|
| 353 | return options; |
---|
| 354 | } |
---|
| 355 | |
---|
| 356 | /** |
---|
| 357 | * Returns the capabilities of this evaluator. |
---|
| 358 | * |
---|
| 359 | * @return the capabilities of this evaluator |
---|
| 360 | * @see Capabilities |
---|
| 361 | */ |
---|
| 362 | public Capabilities getCapabilities() { |
---|
| 363 | Capabilities result = super.getCapabilities(); |
---|
| 364 | result.disableAll(); |
---|
| 365 | |
---|
| 366 | // attributes |
---|
| 367 | result.enable(Capability.NOMINAL_ATTRIBUTES); |
---|
| 368 | result.enable(Capability.NUMERIC_ATTRIBUTES); |
---|
| 369 | result.enable(Capability.DATE_ATTRIBUTES); |
---|
| 370 | result.enable(Capability.MISSING_VALUES); |
---|
| 371 | |
---|
| 372 | // class |
---|
| 373 | result.enable(Capability.NOMINAL_CLASS); |
---|
| 374 | result.enable(Capability.NUMERIC_CLASS); |
---|
| 375 | result.enable(Capability.DATE_CLASS); |
---|
| 376 | result.enable(Capability.MISSING_CLASS_VALUES); |
---|
| 377 | result.enable(Capability.NO_CLASS); |
---|
| 378 | |
---|
| 379 | return result; |
---|
| 380 | } |
---|
| 381 | |
---|
| 382 | /** |
---|
| 383 | * Initializes the singular values/vectors and performs the analysis |
---|
| 384 | * @param data the instances to analyse/transform |
---|
| 385 | * @throws Exception if analysis fails |
---|
| 386 | */ |
---|
| 387 | public void buildEvaluator(Instances data) throws Exception { |
---|
| 388 | // can evaluator handle data? |
---|
| 389 | getCapabilities().testWithFail(data); |
---|
| 390 | |
---|
| 391 | buildAttributeConstructor(data); |
---|
| 392 | } |
---|
| 393 | |
---|
| 394 | /** |
---|
| 395 | * Initializes the singular values/vectors and performs the analysis |
---|
| 396 | * @param data the instances to analyse/transform |
---|
| 397 | * @throws Exception if analysis fails |
---|
| 398 | */ |
---|
| 399 | private void buildAttributeConstructor (Instances data) throws Exception { |
---|
| 400 | // initialize attributes for performing analysis |
---|
| 401 | m_transpose = false; |
---|
| 402 | m_s = null; |
---|
| 403 | m_u = null; |
---|
| 404 | m_v = null; |
---|
| 405 | m_outputNumAttributes = -1; |
---|
| 406 | m_actualRank = -1; |
---|
| 407 | m_sumSquaredSingularValues = 0.0; |
---|
| 408 | |
---|
| 409 | m_trainInstances = new Instances(data); |
---|
| 410 | m_trainHeader = null; |
---|
| 411 | |
---|
| 412 | m_attributeFilter = null; |
---|
| 413 | m_nominalToBinaryFilter = null; |
---|
| 414 | |
---|
| 415 | m_replaceMissingFilter = new ReplaceMissingValues(); |
---|
| 416 | m_replaceMissingFilter.setInputFormat(m_trainInstances); |
---|
| 417 | m_trainInstances = Filter.useFilter(m_trainInstances, m_replaceMissingFilter); |
---|
| 418 | |
---|
| 419 | // vector to hold indices of attributes to delete (class attribute, |
---|
| 420 | // attributes that are all missing, or attributes with one distinct value) |
---|
| 421 | Vector attributesToRemove = new Vector(); |
---|
| 422 | |
---|
| 423 | // if data has a class attribute |
---|
| 424 | if (m_trainInstances.classIndex() >= 0) { |
---|
| 425 | |
---|
| 426 | m_hasClass = true; |
---|
| 427 | m_classIndex = m_trainInstances.classIndex(); |
---|
| 428 | |
---|
| 429 | // set class attribute to be removed |
---|
| 430 | attributesToRemove.addElement(new Integer(m_classIndex)); |
---|
| 431 | } |
---|
| 432 | // make copy of training data so the class values (if set) can be appended to final |
---|
| 433 | // transformed instances and so that we can check header compatibility |
---|
| 434 | m_trainHeader = new Instances(m_trainInstances, 0); |
---|
| 435 | |
---|
| 436 | // normalize data if desired |
---|
| 437 | if (m_normalize) { |
---|
| 438 | m_normalizeFilter = new Normalize(); |
---|
| 439 | m_normalizeFilter.setInputFormat(m_trainInstances); |
---|
| 440 | m_trainInstances = Filter.useFilter(m_trainInstances, m_normalizeFilter); |
---|
| 441 | } |
---|
| 442 | |
---|
| 443 | // convert any nominal attributes to binary numeric attributes |
---|
| 444 | m_nominalToBinaryFilter = new NominalToBinary(); |
---|
| 445 | m_nominalToBinaryFilter.setInputFormat(m_trainInstances); |
---|
| 446 | m_trainInstances = Filter.useFilter(m_trainInstances, m_nominalToBinaryFilter); |
---|
| 447 | |
---|
| 448 | // delete any attributes with only one distinct value or are all missing |
---|
| 449 | for (int i = 0; i < m_trainInstances.numAttributes(); i++) { |
---|
| 450 | if (m_trainInstances.numDistinctValues(i) <= 1) { |
---|
| 451 | attributesToRemove.addElement(new Integer(i)); |
---|
| 452 | } |
---|
| 453 | } |
---|
| 454 | |
---|
| 455 | // remove columns from the data if necessary |
---|
| 456 | if (attributesToRemove.size() > 0) { |
---|
| 457 | m_attributeFilter = new Remove(); |
---|
| 458 | int [] todelete = new int[attributesToRemove.size()]; |
---|
| 459 | for (int i = 0; i < attributesToRemove.size(); i++) { |
---|
| 460 | todelete[i] = ((Integer)(attributesToRemove.elementAt(i))).intValue(); |
---|
| 461 | } |
---|
| 462 | m_attributeFilter.setAttributeIndicesArray(todelete); |
---|
| 463 | m_attributeFilter.setInvertSelection(false); |
---|
| 464 | m_attributeFilter.setInputFormat(m_trainInstances); |
---|
| 465 | m_trainInstances = Filter.useFilter(m_trainInstances, m_attributeFilter); |
---|
| 466 | } |
---|
| 467 | |
---|
| 468 | // can evaluator handle the processed data ? e.g., enough attributes? |
---|
| 469 | getCapabilities().testWithFail(m_trainInstances); |
---|
| 470 | |
---|
| 471 | // record properties of final, ready-to-process data |
---|
| 472 | m_numInstances = m_trainInstances.numInstances(); |
---|
| 473 | m_numAttributes = m_trainInstances.numAttributes(); |
---|
| 474 | |
---|
| 475 | // create matrix of attribute values and compute singular value decomposition |
---|
| 476 | double [][] trainValues = new double[m_numAttributes][m_numInstances]; |
---|
| 477 | for (int i = 0; i < m_numAttributes; i++) { |
---|
| 478 | trainValues[i] = m_trainInstances.attributeToDoubleArray(i); |
---|
| 479 | } |
---|
| 480 | Matrix trainMatrix = new Matrix(trainValues); |
---|
| 481 | // svd requires rows >= columns, so transpose data if necessary |
---|
| 482 | if (m_numAttributes < m_numInstances) { |
---|
| 483 | m_transpose = true; |
---|
| 484 | trainMatrix = trainMatrix.transpose(); |
---|
| 485 | } |
---|
| 486 | SingularValueDecomposition trainSVD = trainMatrix.svd(); |
---|
| 487 | m_u = trainSVD.getU(); // left singular vectors |
---|
| 488 | m_s = trainSVD.getS(); // singular values |
---|
| 489 | m_v = trainSVD.getV(); // right singular vectors |
---|
| 490 | |
---|
| 491 | // find actual rank to use |
---|
| 492 | int maxSingularValues = trainSVD.rank(); |
---|
| 493 | for (int i = 0; i < m_s.getRowDimension(); i++) { |
---|
| 494 | m_sumSquaredSingularValues += m_s.get(i, i) * m_s.get(i, i); |
---|
| 495 | } |
---|
| 496 | if (maxSingularValues == 0) { // no nonzero singular values (shouldn't happen) |
---|
| 497 | // reset values from computation |
---|
| 498 | m_s = null; |
---|
| 499 | m_u = null; |
---|
| 500 | m_v = null; |
---|
| 501 | m_sumSquaredSingularValues = 0.0; |
---|
| 502 | |
---|
| 503 | throw new Exception("SVD computation produced no non-zero singular values."); |
---|
| 504 | } |
---|
| 505 | if (m_rank > maxSingularValues || m_rank <= 0) { // adjust rank if too high or too low |
---|
| 506 | m_actualRank = maxSingularValues; |
---|
| 507 | } else if (m_rank < 1.0) { // determine how many singular values to include for desired coverage |
---|
| 508 | double currentSumOfSquaredSingularValues = 0.0; |
---|
| 509 | for (int i = 0; i < m_s.getRowDimension() && m_actualRank == -1; i++) { |
---|
| 510 | currentSumOfSquaredSingularValues += m_s.get(i, i) * m_s.get(i, i); |
---|
| 511 | if (currentSumOfSquaredSingularValues / m_sumSquaredSingularValues >= m_rank) { |
---|
| 512 | m_actualRank = i + 1; |
---|
| 513 | } |
---|
| 514 | } |
---|
| 515 | } else { |
---|
| 516 | m_actualRank = (int) m_rank; |
---|
| 517 | } |
---|
| 518 | |
---|
| 519 | // lower matrix ranks, adjust for transposition (if necessary), and |
---|
| 520 | // compute matrix for transforming future instances |
---|
| 521 | if (m_transpose) { |
---|
| 522 | Matrix tempMatrix = m_u; |
---|
| 523 | m_u = m_v; |
---|
| 524 | m_v = tempMatrix; |
---|
| 525 | } |
---|
| 526 | m_u = m_u.getMatrix(0, m_u.getRowDimension() - 1, 0, m_actualRank - 1); |
---|
| 527 | m_s = m_s.getMatrix(0, m_actualRank - 1, 0, m_actualRank - 1); |
---|
| 528 | m_v = m_v.getMatrix(0, m_v.getRowDimension() - 1, 0, m_actualRank - 1); |
---|
| 529 | m_transformationMatrix = m_u.times(m_s.inverse()); |
---|
| 530 | |
---|
| 531 | //create dataset header for transformed instances |
---|
| 532 | m_transformedFormat = setOutputFormat(); |
---|
| 533 | } |
---|
| 534 | |
---|
  /**
   * Set the format for the transformed data
   * @return a set of empty Instances (header only) in the new format,
   *         or null if the analysis has not been performed yet
   */
  private Instances setOutputFormat() {
    // if analysis hasn't been performed (successfully) yet
    if (m_s == null) {
      return null;
    }

    // set up transformed attributes: one per retained latent variable,
    // plus the original class attribute (if one was set)
    if (m_hasClass) {
      m_outputNumAttributes = m_actualRank + 1;
    } else {
      m_outputNumAttributes = m_actualRank;
    }
    // cap how many original attributes are embedded in each new attribute name
    int numAttributesInName = m_maxAttributesInName;
    if (numAttributesInName <= 0 || numAttributesInName >= m_numAttributes) {
      numAttributesInName = m_numAttributes;
    }
    FastVector attributes = new FastVector(m_outputNumAttributes);
    for (int i = 0; i < m_actualRank; i++) {
      // create attribute name: a truncated linear combination of the first
      // numAttributesInName original attributes, e.g. "0.5attr1+0.3attr2..."
      String attributeName = "";
      // column i of the transformation matrix holds the coefficients that
      // map the original attributes onto latent variable i
      double [] attributeCoefficients =
        m_transformationMatrix.getMatrix(0, m_numAttributes - 1, i, i).getColumnPackedCopy();
      for (int j = 0; j < numAttributesInName; j++) {
        if (j > 0) {
          attributeName += "+";
        }
        attributeName += Utils.doubleToString(attributeCoefficients[j], 5, 3);
        attributeName += m_trainInstances.attribute(j).name();
      }
      // indicate truncation when not all original attributes are listed
      if (numAttributesInName < m_numAttributes) {
        attributeName += "...";
      }
      // add attribute
      attributes.addElement(new Attribute(attributeName));
    }
    // add original class attribute if present (copied from the saved header)
    if (m_hasClass) {
      attributes.addElement(m_trainHeader.classAttribute().copy());
    }
    // create blank header
    Instances outputFormat = new Instances(m_trainInstances.relationName() + "_LSA",
                                           attributes, 0);
    m_outputNumAttributes = outputFormat.numAttributes();
    // set class attribute if applicable (always the last attribute)
    if (m_hasClass) {
      outputFormat.setClassIndex(m_outputNumAttributes - 1);
    }

    return outputFormat;
  }
---|
| 589 | |
---|
| 590 | /** |
---|
| 591 | * Returns just the header for the transformed data (ie. an empty |
---|
| 592 | * set of instances. This is so that AttributeSelection can |
---|
| 593 | * determine the structure of the transformed data without actually |
---|
| 594 | * having to get all the transformed data through getTransformedData(). |
---|
| 595 | * @return the header of the transformed data. |
---|
| 596 | * @throws Exception if the header of the transformed data can't |
---|
| 597 | * be determined. |
---|
| 598 | */ |
---|
| 599 | public Instances transformedHeader() throws Exception { |
---|
| 600 | if (m_s == null) { |
---|
| 601 | throw new Exception("Latent Semantic Analysis hasn't been successfully performed."); |
---|
| 602 | } |
---|
| 603 | return m_transformedFormat; |
---|
| 604 | } |
---|
| 605 | |
---|
  /**
   * Transform the supplied data set (assumed to be the same format
   * as the training data)
   * @return the transformed training data
   * @throws Exception if transformed data can't be returned
   */
  public Instances transformedData(Instances data) throws Exception {
    if (m_s == null) {
      throw new Exception("Latent Semantic Analysis hasn't been built yet");
    }

    Instances output = new Instances(m_transformedFormat, m_numInstances);

    // the transformed version of instance i from the training data
    // is stored as the i'th row vector in v (the right singular vectors)
    // NOTE(review): because row i of m_v is read directly, this only yields
    // meaningful values when 'data' is the training data itself, in training
    // order -- other instances should go through convertInstance() instead.
    for (int i = 0; i < data.numInstances(); i++) {
      Instance currentInstance = data.instance(i);
      // record attribute values for converted instance
      double [] newValues = new double[m_outputNumAttributes];
      for (int j = 0; j < m_actualRank; j++) { // fill in values from v
        newValues[j] = m_v.get(i, j);
      }
      if (m_hasClass) { // copy class value if applicable (last attribute)
        newValues[m_outputNumAttributes - 1] = currentInstance.classValue();
      }
      //create new instance with recorded values and add to output dataset,
      //preserving the original instance's sparse/dense representation and weight
      Instance newInstance;
      if (currentInstance instanceof SparseInstance) {
        newInstance = new SparseInstance(currentInstance.weight(), newValues);
      } else {
        newInstance = new DenseInstance(currentInstance.weight(), newValues);
      }
      output.add(newInstance);
    }

    return output;
  }
---|
| 643 | |
---|
| 644 | /** |
---|
| 645 | * Evaluates the merit of a transformed attribute. This is defined |
---|
| 646 | * to be the square of the singular value for the latent variable |
---|
| 647 | * corresponding to the transformed attribute. |
---|
| 648 | * @param att the attribute to be evaluated |
---|
| 649 | * @return the merit of a transformed attribute |
---|
| 650 | * @throws Exception if attribute can't be evaluated |
---|
| 651 | */ |
---|
| 652 | public double evaluateAttribute(int att) throws Exception { |
---|
| 653 | if (m_s == null) { |
---|
| 654 | throw new Exception("Latent Semantic Analysis hasn't been successfully" + |
---|
| 655 | " performed yet!"); |
---|
| 656 | } |
---|
| 657 | |
---|
| 658 | //return the square of the corresponding singular value |
---|
| 659 | return (m_s.get(att, att) * m_s.get(att, att)) / m_sumSquaredSingularValues; |
---|
| 660 | } |
---|
| 661 | |
---|
  /**
   * Transform an instance in original (unnormalized) format.
   * The instance is pushed through the same filter chain that was applied
   * to the training data (missing-value replacement, optional normalization,
   * nominal-to-binary, attribute removal) and then folded into the concept
   * space via the transformation matrix U * S^-1.
   * @param instance an instance in the original (unnormalized) format
   * @return a transformed instance
   * @throws Exception if instance can't be transformed
   */
  public Instance convertInstance(Instance instance) throws Exception {
    if (m_s == null) {
      throw new Exception("convertInstance: Latent Semantic Analysis not " +
                          "performed yet.");
    }

    // array to hold new attribute values
    double [] newValues = new double[m_outputNumAttributes];

    // apply filters so new instance is in same format as training instances
    Instance tempInstance = (Instance)instance.copy();
    // refuse instances whose dataset header differs from the training header
    if (!instance.dataset().equalHeaders(m_trainHeader)) {
      throw new Exception("Can't convert instance: headers don't match: " +
                          "LatentSemanticAnalysis\n" + instance.dataset().equalHeadersMsg(m_trainHeader));
    }
    // replace missing values (batchFinished() flushes the single instance through)
    m_replaceMissingFilter.input(tempInstance);
    m_replaceMissingFilter.batchFinished();
    tempInstance = m_replaceMissingFilter.output();
    // normalize (only if normalization was applied at training time)
    if (m_normalize) {
      m_normalizeFilter.input(tempInstance);
      m_normalizeFilter.batchFinished();
      tempInstance = m_normalizeFilter.output();
    }
    // convert nominal attributes to binary
    m_nominalToBinaryFilter.input(tempInstance);
    m_nominalToBinaryFilter.batchFinished();
    tempInstance = m_nominalToBinaryFilter.output();
    // remove class/other attributes (filter is null when nothing was removed)
    if (m_attributeFilter != null) {
      m_attributeFilter.input(tempInstance);
      m_attributeFilter.batchFinished();
      tempInstance = m_attributeFilter.output();
    }

    // record new attribute values
    if (m_hasClass) { // copy class value from the UNfiltered original instance
      newValues[m_outputNumAttributes - 1] = instance.classValue();
    }
    // fold the filtered instance into the concept space:
    // newRow = oldRow (1 x numAttributes) * transformationMatrix
    double [][] oldInstanceValues = new double[1][m_numAttributes];
    oldInstanceValues[0] = tempInstance.toDoubleArray();
    Matrix instanceVector = new Matrix(oldInstanceValues); // old attribute values
    instanceVector = instanceVector.times(m_transformationMatrix); // new attribute values
    for (int i = 0; i < m_actualRank; i++) {
      newValues[i] = instanceVector.get(0, i);
    }

    // return newly transformed instance, preserving sparse/dense form and weight
    if (instance instanceof SparseInstance) {
      return new SparseInstance(instance.weight(), newValues);
    } else {
      return new DenseInstance(instance.weight(), newValues);
    }
  }
---|
| 723 | |
---|
| 724 | /** |
---|
| 725 | * Returns a description of this attribute transformer |
---|
| 726 | * @return a String describing this attribute transformer |
---|
| 727 | */ |
---|
| 728 | public String toString() { |
---|
| 729 | if (m_s == null) { |
---|
| 730 | return "Latent Semantic Analysis hasn't been built yet!"; |
---|
| 731 | } else { |
---|
| 732 | return "\tLatent Semantic Analysis Attribute Transformer\n\n" |
---|
| 733 | + lsaSummary(); |
---|
| 734 | } |
---|
| 735 | } |
---|
| 736 | |
---|
| 737 | /** |
---|
| 738 | * Return a summary of the analysis |
---|
| 739 | * @return a summary of the analysis. |
---|
| 740 | */ |
---|
| 741 | private String lsaSummary() { |
---|
| 742 | StringBuffer result = new StringBuffer(); |
---|
| 743 | |
---|
| 744 | // print number of latent variables used |
---|
| 745 | result.append("Number of latent variables utilized: " + m_actualRank); |
---|
| 746 | |
---|
| 747 | // print singular values |
---|
| 748 | result.append("\n\nSingularValue\tLatentVariable#\n"); |
---|
| 749 | // create single array of singular values rather than diagonal matrix |
---|
| 750 | for (int i = 0; i < m_actualRank; i++) { |
---|
| 751 | result.append(Utils.doubleToString(m_s.get(i, i), 9, 5) + "\t" + (i + 1) + "\n"); |
---|
| 752 | } |
---|
| 753 | |
---|
| 754 | // print attribute vectors |
---|
| 755 | result.append("\nAttribute vectors (left singular vectors) -- row vectors show\n" + |
---|
| 756 | "the relation between the original attributes and the latent \n" + |
---|
| 757 | "variables computed by the singular value decomposition:\n"); |
---|
| 758 | for (int i = 0; i < m_actualRank; i++) { |
---|
| 759 | result.append("LatentVariable#" + (i + 1) + "\t"); |
---|
| 760 | } |
---|
| 761 | result.append("AttributeName\n"); |
---|
| 762 | for (int i = 0; i < m_u.getRowDimension(); i++) { // for each attribute |
---|
| 763 | for (int j = 0; j < m_u.getColumnDimension(); j++) { // for each latent variable |
---|
| 764 | result.append(Utils.doubleToString(m_u.get(i, j), 9, 5) + "\t\t"); |
---|
| 765 | } |
---|
| 766 | result.append(m_trainInstances.attribute(i).name() + "\n"); |
---|
| 767 | } |
---|
| 768 | |
---|
| 769 | // print instance vectors |
---|
| 770 | result.append("\n\nInstance vectors (right singular vectors) -- column\n" + |
---|
| 771 | "vectors show the relation between the original instances and the\n" + |
---|
| 772 | "latent variables computed by the singular value decomposition:\n"); |
---|
| 773 | for (int i = 0; i < m_numInstances; i++) { |
---|
| 774 | result.append("Instance#" + (i + 1) + "\t"); |
---|
| 775 | } |
---|
| 776 | result.append("LatentVariable#\n"); |
---|
| 777 | for (int i = 0; i < m_v.getColumnDimension(); i++) { // for each instance |
---|
| 778 | for (int j = 0; j < m_v.getRowDimension(); j++) { // for each latent variable |
---|
| 779 | // going down columns instead of across rows because we're |
---|
| 780 | // printing v' but have v stored |
---|
| 781 | result.append(Utils.doubleToString(m_v.get(j, i), 9, 5) + "\t"); |
---|
| 782 | } |
---|
| 783 | result.append((i + 1) + "\n"); |
---|
| 784 | } |
---|
| 785 | |
---|
| 786 | return result.toString(); |
---|
| 787 | } |
---|
| 788 | |
---|
| 789 | /** |
---|
| 790 | * Returns the revision string. |
---|
| 791 | * |
---|
| 792 | * @return the revision |
---|
| 793 | */ |
---|
| 794 | public String getRevision() { |
---|
| 795 | return RevisionUtils.extract("$Revision: 5987 $"); |
---|
| 796 | } |
---|
| 797 | |
---|
| 798 | /** |
---|
| 799 | * Main method for testing this class |
---|
| 800 | * @param argv should contain the command line arguments to the |
---|
| 801 | * evaluator/transformer (see AttributeSelection) |
---|
| 802 | */ |
---|
| 803 | public static void main(String [] argv) { |
---|
| 804 | runEvaluator(new LatentSemanticAnalysis(), argv); |
---|
| 805 | } |
---|
| 806 | } |
---|