/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * sIB.java
 * Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.matrix.Matrix;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Cluster data using the sequential information bottleneck algorithm.<br/>
 * <br/>
 * Note: only the hard clustering scheme is supported. sIB assigns each instance to the cluster that has the minimum cost/distance to the instance. The trade-off beta is set to infinite, so 1/beta is zero.<br/>
 * <br/>
 * For more information, see:<br/>
 * <br/>
 * Noam Slonim, Nir Friedman, Naftali Tishby: Unsupervised document classification using sequential information maximization. In: Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval, 129-136, 2002.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * @inproceedings{Slonim2002,
 *    author = {Noam Slonim and Nir Friedman and Naftali Tishby},
 *    booktitle = {Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval},
 *    pages = {129-136},
 *    title = {Unsupervised document classification using sequential information maximization},
 *    year = {2002}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -I <num>
 *  maximum number of iterations
 *  (default 100).</pre>
 *
 * <pre> -M <num>
 *  minimum number of changes in a single iteration
 *  (default 0).</pre>
 *
 * <pre> -N <num>
 *  number of clusters.
 *  (default 2).</pre>
 *
 * <pre> -R <num>
 *  number of restarts.
 *  (default 5).</pre>
 *
 * <pre> -U
 *  set not to normalize the data
 *  (default true).</pre>
 *
 * <pre> -V
 *  set to output debug info
 *  (default false).</pre>
 *
 * <pre> -S <num>
 *  Random number seed.
 *  (default 1)</pre>
 *
 <!-- options-end -->
 *
 * @author Noam Slonim
 * @author <a href="mailto:lh92@cs.waikato.ac.nz">Anna Huang</a>
 * @version $Revision: 5987 $
 */
public class sIB
  extends RandomizableClusterer
  implements TechnicalInformationHandler {

  /** for serialization. */
  private static final long serialVersionUID = -8652125897352654213L;

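  /*
   * Typical programmatic use (a minimal sketch, not part of the original source;
   * the dataset path "documents.arff" is hypothetical and the data must contain
   * numeric attributes only):
   *
   *   Instances data = weka.core.converters.ConverterUtils.DataSource.read("documents.arff");
   *   sIB clusterer = new sIB();
   *   clusterer.setNumClusters(5);       // -N
   *   clusterer.setNumRestarts(10);      // -R
   *   clusterer.setMaxIterations(100);   // -I
   *   clusterer.setSeed(1);              // -S, inherited from RandomizableClusterer
   *   clusterer.buildClusterer(data);
   *   int cluster = clusterer.clusterInstance(data.instance(0));
   */
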
  /**
   * Inner class handling status of the input data
   *
   * @see Serializable
   */
  private class Input
    implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = -2464453171263384037L;

    /** Prior probability of each instance */
    private double[] Px;

    /** Prior probability of each attribute */
    private double[] Py;

    /** Joint distribution of attribute and instance */
    private Matrix Pyx;

    /** P[y|x] */
    private Matrix Py_x;

    /** Mutual information between the instances and the attributes */
    private double Ixy;

    /** Entropy of the attributes */
    private double Hy;

    /** Entropy of the instances */
    private double Hx;

    /** Sum values of the dataset */
    private double sumVals;

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 5987 $");
    }
  }

  /**
   * Internal class handling the whole partition
   *
   * @see Serializable
   */
  private class Partition
    implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = 4957194978951259946L;

    /** Cluster assignment for each instance */
    private int[] Pt_x;

    /** Prior probability of each cluster */
    private double[] Pt;

    /** sIB equation score, to evaluate the quality of the partition */
    private double L;

    /** Number of changes during the generation of this partition */
    private int counter;

    /** Attribute probabilities for each cluster */
    private Matrix Py_t;

    /**
     * Create a new empty <code>Partition</code> instance.
     */
    public Partition() {
      Pt_x = new int[m_numInstances];
      for (int i = 0; i < m_numInstances; i++) {
        Pt_x[i] = -1;
      }
      Pt = new double[m_numCluster];
      Py_t = new Matrix(m_numAttributes, m_numCluster);
      counter = 0;
    }

    /**
     * Find all the instances that have been assigned to cluster i
     * @param i index of the cluster
     * @return an arraylist of the instance ids that have been assigned to cluster i
     */
    private ArrayList<Integer> find(int i) {
      ArrayList<Integer> indices = new ArrayList<Integer>();
      for (int x = 0; x < Pt_x.length; x++) {
        if (Pt_x[x] == i) {
          indices.add(x);
        }
      }
      return indices;
    }

    /**
     * Find the size of cluster i
     * @param i index of the cluster
     * @return the size of cluster i
     */
    private int size(int i) {
      int count = 0;
      for (int x = 0; x < Pt_x.length; x++) {
        if (Pt_x[x] == i) {
          count++;
        }
      }
      return count;
    }

    /**
     * Copy the current partition into T
     * @param T the target partition object
     */
    private void copy(Partition T) {
      if (T == null) {
        T = new Partition();
      }
      System.arraycopy(Pt_x, 0, T.Pt_x, 0, Pt_x.length);
      System.arraycopy(Pt, 0, T.Pt, 0, Pt.length);
      T.L = L;
      T.counter = counter;

      double[][] mArray = Py_t.getArray();
      double[][] tgtArray = T.Py_t.getArray();
      for (int i = 0; i < mArray.length; i++) {
        System.arraycopy(mArray[i], 0, tgtArray[i], 0, mArray[0].length);
      }
    }

    /**
     * Output the current partition
     * @return a string that describes the partition
     */
    public String toString() {
      StringBuffer text = new StringBuffer();
      text.append("score (L) : " + Utils.doubleToString(L, 4) + "\n");
      text.append("number of changes : " + counter + "\n");
      for (int i = 0; i < m_numCluster; i++) {
        text.append("\nCluster " + i + "\n");
        text.append("size : " + size(i) + "\n");
        text.append("prior prob : " + Utils.doubleToString(Pt[i], 4) + "\n");
      }
      return text.toString();
    }

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 5987 $");
    }
  }

  /** Training data */
  private Instances m_data;

  /** Number of clusters */
  private int m_numCluster = 2;

  /** Number of restarts */
  private int m_numRestarts = 5;

  /** Verbose? */
  private boolean m_verbose = false;

  /** Uniform prior probability of the documents */
  private boolean m_uniformPrior = true;

  /** Max number of iterations during each restart */
  private int m_maxLoop = 100;

  /** Minimum number of changes */
  private int m_minChange = 0;

  /** Globally replace missing values */
  private ReplaceMissingValues m_replaceMissing;

  /** Number of instances */
  private int m_numInstances;

  /** Number of attributes */
  private int m_numAttributes;

  /** Randomly generate initial partition */
  private Random random;

  /** Holds the best partition built */
  private Partition bestT;

  /** Holds the statistics about the input dataset */
  private Input input;

  /**
   * Generates a clusterer.
   *
   * @param data the training instances
   * @throws Exception if something goes wrong
   */
  public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data ?
    getCapabilities().testWithFail(data);

    m_replaceMissing = new ReplaceMissingValues();
    Instances instances = new Instances(data);
    instances.setClassIndex(-1);
    m_replaceMissing.setInputFormat(instances);
    data = weka.filters.Filter.useFilter(instances, m_replaceMissing);
    instances = null;

    // initialize all fields that are not being set via options
    m_data = data;
    m_numInstances = m_data.numInstances();
    m_numAttributes = m_data.numAttributes();
    random = new Random(getSeed());

    // initialize the statistics of the input training data
    input = sIB_ProcessInput();

    // object to hold the best partition
    bestT = new Partition();

    // the real clustering
    double bestL = Double.NEGATIVE_INFINITY;
    for (int k = 0; k < m_numRestarts; k++) {
      if (m_verbose) {
        System.out.format("restart number %s...\n", k);
      }

      // initialize the partition and optimize it
      Partition tmpT = sIB_InitT(input);
      tmpT = sIB_OptimizeT(tmpT, input);

      // if a better partition is found, save it
      if (tmpT.L > bestL) {
        tmpT.copy(bestT);
        bestL = bestT.L;
      }

      if (m_verbose) {
        System.out.println("\nPartition status : ");
        System.out.println("------------------");
        System.out.println(tmpT.toString() + "\n");
      }
    }

    if (m_verbose) {
      System.out.println("\nBest Partition");
      System.out.println("===============");
      System.out.println(bestT.toString());
    }

    // save memory
    m_data = new Instances(m_data, 0);
  }

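  // Note on the assignment rule used by clusterInstance() below: an unseen
  // instance x is treated as a singleton cluster with prior 1/sumVals and is
  // merged into the candidate cluster t; the cost of that merge is
  //   d(x, t) = (p(x) + p(t)) * JS_{pi1,pi2}( p(y|x), p(y|t) ),
  // where pi1 = p(x)/(p(x)+p(t)) and pi2 = p(t)/(p(x)+p(t)). The instance is
  // assigned to the cluster with the smallest merge cost (beta -> infinity,
  // i.e. hard assignment).
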
  /**
   * Cluster a given instance. This is the method defined in the Clusterer
   * interface; it simply returns the index of the cluster assigned to the instance.
   * @param instance the instance to be clustered
   * @return the index of the assigned cluster
   * @throws Exception if the instance cannot be clustered
   */
  public int clusterInstance(Instance instance) throws Exception {
    double prior = (double) 1 / input.sumVals;
    double[] distances = new double[m_numCluster];
    for (int i = 0; i < m_numCluster; i++) {
      double Pnew = bestT.Pt[i] + prior;
      double pi1 = prior / Pnew;
      double pi2 = bestT.Pt[i] / Pnew;
      distances[i] = Pnew * JS(instance, i, pi1, pi2);
    }
    return Utils.minIndex(distances);
  }

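  // sIB_ProcessInput() below turns the (document x term) data into the joint
  // distribution p(y, x) over terms y and documents x: each document vector is
  // (optionally) normalized, transposed into a term-document matrix and divided
  // by the total mass, after which the marginals p(x), p(y) and the mutual
  // information I(X;Y) are read off. These statistics drive all later updates.
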
  /**
   * Process the input and compute the statistics of the training data
   * @return an Input object which holds the statistics about the training data
   */
  private Input sIB_ProcessInput() {
    double valSum = 0.0;
    for (int i = 0; i < m_numInstances; i++) {
      valSum = 0.0;
      for (int v = 0; v < m_data.instance(i).numValues(); v++) {
        valSum += m_data.instance(i).valueSparse(v);
      }
      if (valSum <= 0) {
        if (m_verbose) {
          System.out.format("Instance %s sum of values = %s <= 0, removed.\n", i, valSum);
        }
        m_data.delete(i);
        m_numInstances--;
        i--; // the following instance has shifted into slot i, so revisit it
      }
    }

    // get the term-document matrix
    Input input = new Input();
    input.Py_x = getTransposedNormedMatrix(m_data);
    if (m_uniformPrior) {
      input.Pyx = input.Py_x.copy();
      normalizePrior(m_data);
    }
    else {
      input.Pyx = getTransposedMatrix(m_data);
    }
    input.sumVals = getTotalSum(m_data);
    input.Pyx.timesEquals((double) 1 / input.sumVals);

    // prior probability of documents, ie. sum the columns from the Pyx matrix
    input.Px = new double[m_numInstances];
    for (int i = 0; i < m_numInstances; i++) {
      for (int j = 0; j < m_numAttributes; j++) {
        input.Px[i] += input.Pyx.get(j, i);
      }
    }

    // prior probability of terms, ie. sum the rows from the Pyx matrix
    input.Py = new double[m_numAttributes];
    for (int i = 0; i < input.Pyx.getRowDimension(); i++) {
      for (int j = 0; j < input.Pyx.getColumnDimension(); j++) {
        input.Py[i] += input.Pyx.get(i, j);
      }
    }

    MI(input.Pyx, input);
    return input;
  }

  /**
   * Initialize the partition
   * @param input object holding the statistics of the training data
   * @return the initialized partition
   */
  private Partition sIB_InitT(Input input) {
    Partition T = new Partition();
    int avgSize = (int) Math.ceil((double) m_numInstances / m_numCluster);

    ArrayList<Integer> permInstsIdx = new ArrayList<Integer>();
    ArrayList<Integer> unassigned = new ArrayList<Integer>();
    for (int i = 0; i < m_numInstances; i++) {
      unassigned.add(i);
    }
    while (unassigned.size() != 0) {
      int t = random.nextInt(unassigned.size());
      permInstsIdx.add(unassigned.get(t));
      unassigned.remove(t);
    }

    for (int i = 0; i < m_numCluster; i++) {
      int r2 = avgSize > permInstsIdx.size() ? permInstsIdx.size() : avgSize;
      for (int j = 0; j < r2; j++) {
        T.Pt_x[permInstsIdx.get(j)] = i;
      }
      for (int j = 0; j < r2; j++) {
        permInstsIdx.remove(0);
      }
    }

    // initialize the prior prob of each cluster, and the probability
    // for each attribute within the cluster
    for (int i = 0; i < m_numCluster; i++) {
      ArrayList<Integer> indices = T.find(i);
      for (int j = 0; j < indices.size(); j++) {
        T.Pt[i] += input.Px[indices.get(j)];
      }
      double[][] mArray = input.Pyx.getArray();
      for (int j = 0; j < m_numAttributes; j++) {
        double sum = 0.0;
        for (int k = 0; k < indices.size(); k++) {
          sum += mArray[j][indices.get(k)];
        }
        sum /= T.Pt[i];
        T.Py_t.set(j, i, sum);
      }
    }

    if (m_verbose) {
      System.out.println("Initializing...");
    }
    return T;
  }

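  // sIB_OptimizeT() below implements the sequential step of sIB: instances are
  // visited one at a time; each is drawn out of its current cluster (reduce_x),
  // re-assigned to the cluster with the smallest merge cost (clusterInstance),
  // and the cluster statistics are updated (updateAssignment). The sweep is
  // repeated until no more than m_minChange assignments change in a sweep or
  // m_maxLoop sweeps have been performed.
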
  /**
   * Optimize the partition
   * @param tmpT partition to be optimized
   * @param input object describing the statistics of the training dataset
   * @return the optimized partition
   */
  private Partition sIB_OptimizeT(Partition tmpT, Input input) {
    boolean done = false;
    int change = 0, loopCounter = 0;
    if (m_verbose) {
      System.out.println("Optimizing...");
      System.out.println("-------------");
    }
    while (!done) {
      change = 0;
      for (int i = 0; i < m_numInstances; i++) {
        int old_t = tmpT.Pt_x[i];
        // If the current cluster only has one instance left, leave it.
        if (tmpT.size(old_t) == 1) {
          if (m_verbose) {
            System.out.format("cluster %s has only 1 instance remaining\n", old_t);
          }
          continue;
        }
        // draw the instance out from its previous cluster
        reduce_x(i, old_t, tmpT, input);

        // re-cluster the instance and merge it back into the chosen cluster;
        // the merge must also happen when the instance keeps its previous
        // cluster, since reduce_x has removed its mass from that cluster
        int new_t = clusterInstance(i, input, tmpT);
        if (new_t != old_t) {
          change++;
        }
        updateAssignment(i, new_t, tmpT, input.Px[i], input.Py_x);
      }

      tmpT.counter += change;
      if (m_verbose) {
        System.out.format("iteration %s , changes : %s\n", loopCounter, change);
      }
      done = checkConvergence(change, loopCounter);
      loopCounter++;
    }

    // compute the sIB score
    tmpT.L = sIB_local_MI(tmpT.Py_t, tmpT.Pt);
    if (m_verbose) {
      System.out.format("score (L) : %s \n", Utils.doubleToString(tmpT.L, 4));
    }
    return tmpT;
  }

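  // reduce_x() below removes one instance from a cluster by recomputing the
  // cluster prior p(t) and the conditional p(y|t) from the remaining members,
  // so that the instance can afterwards be scored against every cluster as if
  // it were a free-standing singleton.
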
  /**
   * Draw an instance out from a cluster.
   * @param instIdx index of the instance to be drawn out
   * @param t index of the cluster which the instance previously belonged to
   * @param T the current working partition
   * @param input the input statistics
   */
  private void reduce_x(int instIdx, int t, Partition T, Input input) {
    // Update the prior probability of the cluster
    ArrayList<Integer> indices = T.find(t);
    double sum = 0.0;
    for (int i = 0; i < indices.size(); i++) {
      if (indices.get(i) == instIdx)
        continue;
      sum += input.Px[indices.get(i)];
    }
    T.Pt[t] = sum;

    if (T.Pt[t] < 0) {
      System.out.format("Warning: probability < 0 (%s)\n", T.Pt[t]);
      T.Pt[t] = 0;
    }

    // Update prob of each attribute in the cluster
    double[][] mArray = input.Pyx.getArray();
    for (int i = 0; i < m_numAttributes; i++) {
      sum = 0.0;
      for (int j = 0; j < indices.size(); j++) {
        if (indices.get(j) == instIdx)
          continue;
        sum += mArray[i][indices.get(j)];
      }
      T.Py_t.set(i, t, sum / T.Pt[t]);
    }
  }

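  // updateAssignment() below is the merge counterpart of reduce_x(): the new
  // cluster's conditional becomes the weighted average
  //   p(y|t_new) = pi1 * p(y|x) + pi2 * p(y|t_new),
  // with pi1 = p(x)/(p(x)+p(t_new)) and pi2 = p(t_new)/(p(x)+p(t_new)),
  // and the cluster prior grows to p(t_new) + p(x).
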
  /**
   * Put an instance into a new cluster and update.
   * @param instIdx instance to be updated
   * @param newt index of the new cluster this instance has been assigned to
   * @param T the current working partition
   * @param Px prior probability of the instance
   * @param Py_x conditional probabilities of the attributes given the instances
   */
  private void updateAssignment(int instIdx, int newt, Partition T, double Px, Matrix Py_x) {
    T.Pt_x[instIdx] = newt;

    // update probability of attributes in the cluster
    double mass = Px + T.Pt[newt];
    double pi1 = Px / mass;
    double pi2 = T.Pt[newt] / mass;
    for (int i = 0; i < m_numAttributes; i++) {
      T.Py_t.set(i, newt, pi1 * Py_x.get(i, instIdx) + pi2 * T.Py_t.get(i, newt));
    }

    T.Pt[newt] = mass;
  }

  /**
   * Check whether the current iteration has converged
   * @param change number of changes in the current iteration
   * @param loops number of iterations done
   * @return true if the iteration has converged, false otherwise
   */
  private boolean checkConvergence(int change, int loops) {
    if (change <= m_minChange || loops >= m_maxLoop) {
      if (m_verbose) {
        System.out.format("\nsIB converged after %s iterations with %s changes\n", loops,
            change);
      }
      return true;
    }
    return false;
  }

  /**
   * Cluster an instance into the nearest cluster.
   * @param instIdx index of the instance to be clustered
   * @param input object which describes the statistics of the training dataset
   * @param T the current working partition
   * @return index of the cluster that has the minimum distance to the instance
   */
  private int clusterInstance(int instIdx, Input input, Partition T) {
    double[] distances = new double[m_numCluster];
    for (int i = 0; i < m_numCluster; i++) {
      double Pnew = input.Px[instIdx] + T.Pt[i];
      double pi1 = input.Px[instIdx] / Pnew;
      double pi2 = T.Pt[i] / Pnew;
      distances[i] = Pnew * JS(instIdx, input, T, i, pi1, pi2);
    }
    return Utils.minIndex(distances);
  }

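  // Both JS() methods below compute the weighted Jensen-Shannon divergence
  //   JS_{pi1,pi2}(p1, p2) = pi1 * KL(p1 || q) + pi2 * KL(p2 || q),  q = pi1*p1 + pi2*p2,
  // between the attribute distribution of an instance, p1 = p(y|x), and that of
  // a cluster, p2 = p(y|t). Terms with zero probability contribute nothing to
  // the corresponding KL sum and are skipped.
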
  /**
   * Compute the JS divergence between an instance and a cluster, used for training data
   * @param instIdx index of the instance
   * @param input statistics of the input data
   * @param T the whole partition
   * @param t index of the cluster
   * @param pi1 weight of the instance distribution
   * @param pi2 weight of the cluster distribution
   * @return the JS divergence
   */
  private double JS(int instIdx, Input input, Partition T, int t, double pi1, double pi2) {
    if (Math.min(pi1, pi2) <= 0) {
      System.out.format("Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n", pi1, pi2);
      return 0;
    }
    Instance inst = m_data.instance(instIdx);
    double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
    for (int i = 0; i < inst.numValues(); i++) {
      tmp = input.Py_x.get(inst.index(i), instIdx);
      if (tmp != 0) {
        kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * T.Py_t.get(inst.index(i), t)));
      }
    }
    for (int i = 0; i < m_numAttributes; i++) {
      if ((tmp = T.Py_t.get(i, t)) != 0) {
        kl2 += tmp * Math.log(tmp / (input.Py_x.get(i, instIdx) * pi1 + pi2 * tmp));
      }
    }
    return pi1 * kl1 + pi2 * kl2;
  }

  /**
   * Compute the JS divergence between an instance and a cluster, used for test data
   * @param inst instance to be clustered
   * @param t index of the cluster
   * @param pi1 weight of the instance distribution
   * @param pi2 weight of the cluster distribution
   * @return the JS divergence
   */
  private double JS(Instance inst, int t, double pi1, double pi2) {
    if (Math.min(pi1, pi2) <= 0) {
      System.out.format("Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n", pi1, pi2);
      return 0;
    }
    double sum = Utils.sum(inst.toDoubleArray());
    double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
    for (int i = 0; i < inst.numValues(); i++) {
      tmp = inst.valueSparse(i) / sum;
      if (tmp != 0) {
        kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * bestT.Py_t.get(inst.index(i), t)));
      }
    }
    for (int i = 0; i < m_numAttributes; i++) {
      if ((tmp = bestT.Py_t.get(i, t)) != 0) {
        kl2 += tmp * Math.log(tmp / (inst.value(i) * pi1 / sum + pi2 * tmp));
      }
    }
    return pi1 * kl1 + pi2 * kl2;
  }

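  // sIB_local_MI() below evaluates the partition score L = I(T;Y), the mutual
  // information between clusters T and attributes Y, written as
  //   I(T;Y) = H(Y) + H(T) - H(T,Y),
  // where H(T,Y) is computed from the joint p(y,t) = p(y|t) * p(t). With beta
  // set to infinity this is the quantity sIB tries to maximize over partitions.
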
  /**
   * Compute the sIB score
   * @param m a term-cluster matrix, where m[i, j] is the probability of term i given cluster j
   * @param Pt an array of cluster prior probabilities
   * @return the sIB score which indicates the quality of the partition
   */
  private double sIB_local_MI(Matrix m, double[] Pt) {
    double Hy = 0.0, Ht = 0.0;
    for (int i = 0; i < Pt.length; i++) {
      Ht += Pt[i] * Math.log(Pt[i]);
    }
    Ht = -Ht;

    for (int i = 0; i < m_numAttributes; i++) {
      double Py = 0.0;
      for (int j = 0; j < m_numCluster; j++) {
        Py += m.get(i, j) * Pt[j];
      }
      if (Py == 0) continue;
      Hy += Py * Math.log(Py);
    }
    Hy = -Hy;

    double Hyt = 0.0, tmp = 0.0;
    for (int i = 0; i < m.getRowDimension(); i++) {
      for (int j = 0; j < m.getColumnDimension(); j++) {
        if ((tmp = m.get(i, j)) == 0 || Pt[j] == 0) {
          continue;
        }
        tmp *= Pt[j];
        Hyt += tmp * Math.log(tmp);
      }
    }
    return Hy + Ht + Hyt;
  }

  /**
   * Get the sum of values of the dataset
   * @param data set of instances to handle
   * @return sum of all the attribute values for all the instances in the dataset
   */
  private double getTotalSum(Instances data) {
    double sum = 0.0;
    for (int i = 0; i < data.numInstances(); i++) {
      for (int v = 0; v < data.instance(i).numValues(); v++) {
        sum += data.instance(i).valueSparse(v);
      }
    }
    return sum;
  }

  /**
   * Transpose the document-term matrix to a term-document matrix
   * @param data instances with document-term info
   * @return a term-document matrix transposed from the input dataset
   */
  private Matrix getTransposedMatrix(Instances data) {
    double[][] temp = new double[data.numAttributes()][data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
      Instance inst = data.instance(i);
      for (int v = 0; v < inst.numValues(); v++) {
        temp[inst.index(v)][i] = inst.valueSparse(v);
      }
    }
    Matrix My_x = new Matrix(temp);
    return My_x;
  }

  /**
   * Normalize the document vectors
   * @param data instances to be normalized
   */
  private void normalizePrior(Instances data) {
    for (int i = 0; i < data.numInstances(); i++) {
      normalizeInstance(data.instance(i));
    }
  }

  /**
   * Normalize the instance
   * @param inst instance to be normalized
   * @return a new Instance with normalized values
   */
  private Instance normalizeInstance(Instance inst) {
    double[] vals = inst.toDoubleArray();
    double sum = Utils.sum(vals);
    for (int i = 0; i < vals.length; i++) {
      vals[i] /= sum;
    }
    return new DenseInstance(inst.weight(), vals);
  }

  /**
   * Transpose the dataset into a term-document matrix with each document
   * (column) normalized to sum to one, i.e. P[y|x].
   * @param data instances with document-term info
   * @return the transposed, column-normalized matrix
   */
  private Matrix getTransposedNormedMatrix(Instances data) {
    Matrix matrix = new Matrix(data.numAttributes(), data.numInstances());
    for (int i = 0; i < data.numInstances(); i++) {
      double[] vals = data.instance(i).toDoubleArray();
      double sum = Utils.sum(vals);
      for (int v = 0; v < vals.length; v++) {
        vals[v] /= sum;
        matrix.set(v, i, vals[v]);
      }
    }
    return matrix;
  }

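  // MI() below computes the mutual information between instances X and
  // attributes Y as I(X;Y) = H(X) + H(Y) - H(X,Y); the loop accumulates
  // sum_{x,y} p(y,x) log p(y,x) = -H(X,Y) on top of H(X) + H(Y).
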
  /**
   * Compute the MI between instances and attributes
   * @param m the term-document matrix
   * @param input object that describes the statistics about the training data
   */
  private void MI(Matrix m, Input input) {
    int minDimSize = m.getColumnDimension() < m.getRowDimension() ? m.getColumnDimension() : m.getRowDimension();
    if (minDimSize < 2) {
      System.err.println("Warning : This is not a JOINT distribution");
      input.Hx = Entropy(m);
      input.Hy = 0;
      input.Ixy = 0;
      return;
    }

    input.Hx = Entropy(input.Px);
    input.Hy = Entropy(input.Py);

    double entropy = input.Hx + input.Hy;
    for (int i = 0; i < m_numInstances; i++) {
      Instance inst = m_data.instance(i);
      for (int v = 0; v < inst.numValues(); v++) {
        double tmp = m.get(inst.index(v), i);
        if (tmp <= 0) continue;
        entropy += tmp * Math.log(tmp);
      }
    }
    input.Ixy = entropy;
    if (m_verbose) {
      System.out.println("Ixy = " + input.Ixy);
    }
  }

  /**
   * Compute the entropy score based on an array of probabilities
   * @param probs array of non-negative and normalized probabilities
   * @return the entropy value
   */
  private double Entropy(double[] probs) {
    for (int i = 0; i < probs.length; i++) {
      if (probs[i] <= 0) {
        if (m_verbose) {
          System.out.println("Warning: Non-positive probability.");
        }
        return Double.NaN;
      }
    }
    // could be unnormalized, when normalization is not specified
    if (Math.abs(Utils.sum(probs) - 1) >= 1e-6) {
      if (m_verbose) {
        System.out.println("Warning: Not normalized.");
      }
      return Double.NaN;
    }

    double mi = 0.0;
    for (int i = 0; i < probs.length; i++) {
      mi += probs[i] * Math.log(probs[i]);
    }
    mi = -mi;
    return mi;
  }

  /**
   * Compute the entropy score based on a matrix
   * @param p a matrix with non-negative and normalized probabilities
   * @return the entropy value
   */
  private double Entropy(Matrix p) {
    double mi = 0;
    for (int i = 0; i < p.getRowDimension(); i++) {
      for (int j = 0; j < p.getColumnDimension(); j++) {
        if (p.get(i, j) == 0) {
          continue;
        }
        // entropy accumulates p * log(p) for each non-zero entry
        mi += p.get(i, j) * Math.log(p.get(i, j));
      }
    }
    mi = -mi;
    return mi;
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -I <num>
   *  maximum number of iterations
   *  (default 100).</pre>
   *
   * <pre> -M <num>
   *  minimum number of changes in a single iteration
   *  (default 0).</pre>
   *
   * <pre> -N <num>
   *  number of clusters.
   *  (default 2).</pre>
   *
   * <pre> -R <num>
   *  number of restarts.
   *  (default 5).</pre>
   *
   * <pre> -U
   *  set not to normalize the data
   *  (default true).</pre>
   *
   * <pre> -V
   *  set to output debug info
   *  (default false).</pre>
   *
   * <pre> -S <num>
   *  Random number seed.
   *  (default 1)</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String optionString = Utils.getOption('I', options);
    if (optionString.length() != 0) {
      setMaxIterations(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('M', options);
    if (optionString.length() != 0) {
      setMinChange(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('N', options);
    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('R', options);
    if (optionString.length() != 0) {
      setNumRestarts(Integer.parseInt(optionString));
    }
    setNotUnifyNorm(Utils.getFlag('U', options));
    setDebug(Utils.getFlag('V', options));

    super.setOptions(options);
  }

  /**
   * Returns an enumeration describing the available options.
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector<Option> result = new Vector<Option>();
    result.addElement(new Option("\tmaximum number of iterations\n"
        + "\t(default 100).", "I", 1, "-I <num>"));
    result.addElement(new Option(
        "\tminimum number of changes in a single iteration\n"
        + "\t(default 0).", "M", 1, "-M <num>"));
    result.addElement(new Option("\tnumber of clusters.\n" + "\t(default 2).",
        "N", 1, "-N <num>"));
    result.addElement(new Option("\tnumber of restarts.\n"
        + "\t(default 5).", "R", 1, "-R <num>"));
    result.addElement(new Option("\tset not to normalize the data\n"
        + "\t(default true).", "U", 0, "-U"));
    result.addElement(new Option("\tset to output debug info\n"
        + "\t(default false).", "V", 0, "-V"));

    Enumeration en = super.listOptions();
    while (en.hasMoreElements())
      result.addElement((Option) en.nextElement());

    return result.elements();
  }

  /**
   * Gets the current settings.
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    Vector<String> result;
    result = new Vector<String>();
    result.add("-I");
    result.add("" + getMaxIterations());
    result.add("-M");
    result.add("" + getMinChange());
    result.add("-N");
    result.add("" + getNumClusters());
    result.add("-R");
    result.add("" + getNumRestarts());
    if (getNotUnifyNorm()) {
      result.add("-U");
    }
    if (getDebug()) {
      result.add("-V");
    }

    String[] options = super.getOptions();
    for (int i = 0; i < options.length; i++) {
      result.add(options[i]);
    }
    return result.toArray(new String[result.size()]);
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String debugTipText() {
    return "If set to true, clusterer may output additional info to " +
      "the console.";
  }

  /**
   * Set debug mode - verbose output
   * @param v true for verbose output
   */
  public void setDebug(boolean v) {
    m_verbose = v;
  }

  /**
   * Get debug mode
   * @return true if debug mode is set
   */
  public boolean getDebug() {
    return m_verbose;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property
   */
  public String maxIterationsTipText() {
    return "set maximum number of iterations (default 100)";
  }

  /**
   * Set the max number of iterations
   * @param i max number of iterations
   */
  public void setMaxIterations(int i) {
    m_maxLoop = i;
  }

  /**
   * Get the max number of iterations
   * @return max number of iterations
   */
  public int getMaxIterations() {
    return m_maxLoop;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property
   */
  public String minChangeTipText() {
    return "set minimum number of changes (default 0)";
  }

  /**
   * Set the minimum number of changes
   * @param m the minimum number of changes
   */
  public void setMinChange(int m) {
    m_minChange = m;
  }

  /**
   * Get the minimum number of changes
   * @return the minimum number of changes
   */
  public int getMinChange() {
    return m_minChange;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property
   */
  public String numClustersTipText() {
    return "set number of clusters (default 2)";
  }

  /**
   * Set the number of clusters
   * @param n number of clusters
   */
  public void setNumClusters(int n) {
    m_numCluster = n;
  }

  /**
   * Get the number of clusters
   * @return the number of clusters
   */
  public int getNumClusters() {
    return m_numCluster;
  }

  /**
   * Get the number of clusters
   * @return the number of clusters
   */
  public int numberOfClusters() {
    return m_numCluster;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property
   */
  public String numRestartsTipText() {
    return "set number of restarts (default 5)";
  }

  /**
   * Set the number of restarts
   * @param i number of restarts
   */
  public void setNumRestarts(int i) {
    m_numRestarts = i;
  }

  /**
   * Get the number of restarts
   * @return number of restarts
   */
  public int getNumRestarts() {
    return m_numRestarts;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property
   */
  public String notUnifyNormTipText() {
    return "set whether to skip normalizing each instance to a uniform prior probability (e.g. 1).";
  }

  /**
   * Set whether to skip normalizing instances to a uniform prior probability
   * before building the clusterer
   * @param b true to skip the normalization, false to normalize
   */
  public void setNotUnifyNorm(boolean b) {
    m_uniformPrior = !b;
  }

  /**
   * Get whether normalizing instances to a uniform prior probability
   * before building the clusterer is skipped
   * @return true if normalization is skipped, false otherwise
   */
  public boolean getNotUnifyNorm() {
    return !m_uniformPrior;
  }

  /**
   * Returns a string describing this clusterer
   * @return a description of the clusterer suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using the sequential information bottleneck algorithm.\n\n" +
      "Note: only the hard clustering scheme is supported. sIB assigns each " +
      "instance to the cluster that has the minimum cost/distance to the instance. " +
      "The trade-off beta is set to infinite, so 1/beta is zero.\n\n" +
      "For more information, see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Noam Slonim and Nir Friedman and Naftali Tishby");
    result.setValue(Field.YEAR, "2002");
    result.setValue(Field.TITLE, "Unsupervised document classification using sequential information maximization");
    result.setValue(Field.BOOKTITLE, "Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval");
    result.setValue(Field.PAGES, "129-136");

    return result;
  }

  /**
   * Returns default capabilities of the clusterer.
   * @return the capabilities of this clusterer
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();
    result.enable(Capability.NO_CLASS);

    // attributes
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    return result;
  }

  /**
   * Returns a description of the best partition found: for each cluster its
   * size, prior probability and the attribute probabilities given the cluster.
   * @return a string describing the clusterer
   */
  public String toString() {
    StringBuffer text = new StringBuffer();
    text.append("\nsIB\n===\n");
    text.append("\nNumber of clusters: " + m_numCluster + "\n");

    for (int j = 0; j < m_numCluster; j++) {
      text.append("\nCluster: " + j + " Size : " + bestT.size(j) + " Prior probability: "
          + Utils.doubleToString(bestT.Pt[j], 4) + "\n\n");
      for (int i = 0; i < m_numAttributes; i++) {
        text.append("Attribute: " + m_data.attribute(i).name() + "\n");
        text.append("Probability given the cluster = "
            + Utils.doubleToString(bestT.Py_t.get(i, j), 4)
            + "\n");
      }
    }
    return text.toString();
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 5987 $");
  }

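  /*
   * Command-line entry point. A typical invocation might look like the sketch
   * below (the ARFF file name is hypothetical; the -t training-file switch is
   * interpreted by the clusterer evaluation code invoked through runClusterer):
   *
   *   java weka.clusterers.sIB -t documents.arff -N 5 -R 10 -I 100 -S 1
   */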
  public static void main(String[] argv) {
    runClusterer(new sIB(), argv);
  }
}