Context Navigation

source: src/main/java/weka/clusterers/sIB.java @ 25

Last change on this file since 25 was 4, checked in by gnappo, 14 years ago
Import di weka.
File size: 35.2 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* sIB.java
19	* Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23	package weka.clusterers;
24
25	import weka.core.Capabilities;
26	import weka.core.Instance;
27	import weka.core.DenseInstance;
28	import weka.core.Instances;
29	import weka.core.Option;
30	import weka.core.RevisionHandler;
31	import weka.core.RevisionUtils;
32	import weka.core.TechnicalInformation;
33	import weka.core.TechnicalInformationHandler;
34	import weka.core.Utils;
35	import weka.core.Capabilities.Capability;
36	import weka.core.TechnicalInformation.Field;
37	import weka.core.TechnicalInformation.Type;
38	import weka.core.matrix.Matrix;
39	import weka.filters.unsupervised.attribute.ReplaceMissingValues;
40
41	import java.io.Serializable;
42	import java.util.ArrayList;
43	import java.util.Enumeration;
44	import java.util.Random;
45	import java.util.Vector;
46
47	/**
48	<!-- globalinfo-start -->
49	* Cluster data using the sequential information bottleneck algorithm.<br/>
50	* <br/>
51	* Note: only hard clustering scheme is supported. sIB assign for each instance the cluster that have the minimum cost/distance to the instance. The trade-off beta is set to infinite so 1/beta is zero.<br/>
52	* <br/>
53	* For more information, see:<br/>
54	* <br/>
55	* Noam Slonim, Nir Friedman, Naftali Tishby: Unsupervised document classification using sequential information maximization. In: Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval, 129-136, 2002.
56	* <p/>
57	<!-- globalinfo-end -->
58	*
59	<!-- technical-bibtex-start -->
60	* BibTeX:
61	* <pre>
62	* @inproceedings{Slonim2002,
63	* author = {Noam Slonim and Nir Friedman and Naftali Tishby},
64	* booktitle = {Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval},
65	* pages = {129-136},
66	* title = {Unsupervised document classification using sequential information maximization},
67	* year = {2002}
68	* }
69	* </pre>
70	* <p/>
71	<!-- technical-bibtex-end -->
72	*
73	<!-- options-start -->
74	* Valid options are: <p/>
75	*
76	* <pre> -I <num>
77	* maximum number of iterations
78	* (default 100).</pre>
79	*
80	* <pre> -M <num>
81	* minimum number of changes in a single iteration
82	* (default 0).</pre>
83	*
84	* <pre> -N <num>
85	* number of clusters.
86	* (default 2).</pre>
87	*
88	* <pre> -R <num>
89	* number of restarts.
90	* (default 5).</pre>
91	*
92	* <pre> -U
93	* set not to normalize the data
94	* (default true).</pre>
95	*
96	* <pre> -V
97	* set to output debug info
98	* (default false).</pre>
99	*
100	* <pre> -S <num>
101	* Random number seed.
102	* (default 1)</pre>
103	*
104	<!-- options-end -->
105	*
106	* @author Noam Slonim
107	* @author <a href="mailto:lh92@cs.waikato.ac.nz">Anna Huang</a>
108	* @version $Revision: 5987 $
109	*/
110	public class sIB
111	extends RandomizableClusterer
112	implements TechnicalInformationHandler {
113
114	/** for serialization. */
115	private static final long serialVersionUID = -8652125897352654213L;
116
117	/**
118	* Inner class handling status of the input data
119	*
120	* @see Serializable
121	*/
122	private class Input
123	implements Serializable, RevisionHandler {
124
125	/** for serialization */
126	static final long serialVersionUID = -2464453171263384037L;
127
128	/** Prior probability of each instance */
129	private double[] Px;
130
131	/** Prior probability of each attribute */
132	private double[] Py;
133
134	/** Joint distribution of attribute and instance */
135	private Matrix Pyx;
136
137	/** P[y\|x] */
138	private Matrix Py_x;
139
140	/** Mutual information between the instances and the attributes */
141	private double Ixy;
142
143	/** Entropy of the attributes */
144	private double Hy;
145
146	/** Entropy of the instances */
147	private double Hx;
148
149	/** Sum values of the dataset */
150	private double sumVals;
151
152	/**
153	* Returns the revision string.
154	*
155	* @return the revision
156	*/
157	public String getRevision() {
158	return RevisionUtils.extract("$Revision: 5987 $");
159	}
160	}
161
162	/**
163	* Internal class handling the whole partition
164	*
165	* @see Serializable
166	*/
167	private class Partition
168	implements Serializable, RevisionHandler {
169
170	/** for serialization */
171	static final long serialVersionUID = 4957194978951259946L;
172
173	/** Cluster assignment for each instance */
174	private int[] Pt_x;
175
176	/** Prior probability of each cluster */
177	private double[] Pt;
178
179	/** sIB equation score, to evaluate the quality of the partition */
180	private double L;
181
182	/** Number of changes during the generation of this partition */
183	private int counter;
184
185	/** Attribute probablities for each cluster */
186	private Matrix Py_t;
187
188	/**
189	* Create a new empty <code>Partition</code> instance.
190	*/
191	public Partition() {
192	Pt_x = new int[m_numInstances];
193	for (int i = 0; i < m_numInstances; i++) {
194	Pt_x[i] = -1;
195	}
196	Pt = new double[m_numCluster];
197	Py_t = new Matrix(m_numAttributes, m_numCluster);
198	counter = 0;
199	}
200
201	/**
202	* Find all the instances that have been assigned to cluster i
203	* @param i index of the cluster
204	* @return an arraylist of the instance ids that have been assigned to cluster i
205	*/
206	private ArrayList<Integer> find(int i) {
207	ArrayList<Integer> indices = new ArrayList<Integer>();
208	for (int x = 0; x < Pt_x.length; x++) {
209	if (Pt_x[x] == i) {
210	indices.add(x);
211	}
212	}
213	return indices;
214	}
215
216	/**
217	* Find the size of the cluster i
218	* @param i index of the cluster
219	* @return the size of cluster i
220	*/
221	private int size(int i) {
222	int count = 0;
223	for (int x = 0; x < Pt_x.length; x++) {
224	if (Pt_x[x] == i) {
225	count++;
226	}
227	}
228	return count;
229	}
230
231	/**
232	* Copy the current partition into T
233	* @param T the target partition object
234	*/
235	private void copy(Partition T) {
236	if (T == null) {
237	T = new Partition();
238	}
239	System.arraycopy(Pt_x, 0, T.Pt_x, 0, Pt_x.length);
240	System.arraycopy(Pt, 0, T.Pt, 0, Pt.length);
241	T.L = L;
242	T.counter = counter;
243
244	double[][] mArray = Py_t.getArray();
245	double[][] tgtArray = T.Py_t.getArray();
246	for (int i = 0; i < mArray.length; i++) {
247	System.arraycopy(mArray[i], 0, tgtArray[i], 0, mArray[0].length);
248	}
249	}
250
251	/**
252	* Output the current partition
253	* @param insts
254	* @return a string that describes the partition
255	*/
256	public String toString() {
257	StringBuffer text = new StringBuffer();
258	text.append("score (L) : " + Utils.doubleToString(L, 4) + "\n");
259	text.append("number of changes : " + counter +"\n");
260	for (int i = 0; i < m_numCluster; i++) {
261	text.append("\nCluster "+i+"\n");
262	text.append("size : "+size(i)+"\n");
263	text.append("prior prob : "+Utils.doubleToString(Pt[i], 4)+"\n");
264	}
265	return text.toString();
266	}
267
268	/**
269	* Returns the revision string.
270	*
271	* @return the revision
272	*/
273	public String getRevision() {
274	return RevisionUtils.extract("$Revision: 5987 $");
275	}
276	}
277
278	/** Training data */
279	private Instances m_data;
280
281	/** Number of clusters */
282	private int m_numCluster = 2;
283
284	/** Number of restarts */
285	private int m_numRestarts = 5;
286
287	/** Verbose? */
288	private boolean m_verbose = false;
289
290	/** Uniform prior probability of the documents */
291	private boolean m_uniformPrior = true;
292
293	/** Max number of iterations during each restart */
294	private int m_maxLoop = 100;
295
296	/** Minimum number of changes */
297	private int m_minChange = 0;
298
299	/** Globally replace missing values */
300	private ReplaceMissingValues m_replaceMissing;
301
302	/** Number of instances */
303	private int m_numInstances;
304
305	/** Number of attributes */
306	private int m_numAttributes;
307
308	/** Randomly generate initial partition */
309	private Random random;
310
311	/** Holds the best partition built */
312	private Partition bestT;
313
314	/** Holds the statistics about the input dataset */
315	private Input input;
316
317	/**
318	* Generates a clusterer.
319	*
320	* @param data the training instances
321	* @throws Exception if something goes wrong
322	*/
323	public void buildClusterer(Instances data) throws Exception {
324	// can clusterer handle the data ?
325	getCapabilities().testWithFail(data);
326
327	m_replaceMissing = new ReplaceMissingValues();
328	Instances instances = new Instances(data);
329	instances.setClassIndex(-1);
330	m_replaceMissing.setInputFormat(instances);
331	data = weka.filters.Filter.useFilter(instances, m_replaceMissing);
332	instances = null;
333
334	// initialize all fields that are not being set via options
335	m_data = data;
336	m_numInstances = m_data.numInstances();
337	m_numAttributes = m_data.numAttributes();
338	random = new Random(getSeed());
339
340	// initialize the statistics of the input training data
341	input = sIB_ProcessInput();
342
343	// object to hold the best partition
344	bestT = new Partition();
345
346	// the real clustering
347	double bestL = Double.NEGATIVE_INFINITY;
348	for (int k = 0; k < m_numRestarts; k++) {
349	if(m_verbose) {
350	System.out.format("restart number %s...\n", k);
351	}
352
353	// initialize the partition and optimize it
354	Partition tmpT = sIB_InitT(input);
355	tmpT = sIB_OptimizeT(tmpT, input);
356
357	// if a better partition is found, save it
358	if (tmpT.L > bestL) {
359	tmpT.copy(bestT);
360	bestL = bestT.L;
361	}
362
363	if(m_verbose) {
364	System.out.println("\nPartition status : ");
365	System.out.println("------------------");
366	System.out.println(tmpT.toString()+"\n");
367	}
368	}
369
370	if(m_verbose){
371	System.out.println("\nBest Partition");
372	System.out.println("===============");
373	System.out.println(bestT.toString());
374	}
375
376	// save memory
377	m_data = new Instances(m_data, 0);
378	}
379
380	/**
381	* Cluster a given instance, this is the method defined in Clusterer
382	* interface do nothing but just return the cluster assigned to it
383	*/
384	public int clusterInstance(Instance instance) throws Exception {
385	double prior = (double) 1 / input.sumVals;
386	double[] distances = new double[m_numCluster];
387	for(int i = 0; i < m_numCluster; i++){
388	double Pnew = bestT.Pt[i] + prior;
389	double pi1 = prior / Pnew;
390	double pi2 = bestT.Pt[i] / Pnew;
391	distances[i] = Pnew * JS(instance, i, pi1, pi2);
392	}
393	return Utils.minIndex(distances);
394	}
395
396	/**
397	* Process the input and compute the statistics of the training data
398	* @return an Input object which holds the statistics about the training data
399	*/
400	private Input sIB_ProcessInput() {
401	double valSum = 0.0;
402	for (int i = 0; i < m_numInstances; i++) {
403	valSum = 0.0;
404	for (int v = 0; v < m_data.instance(i).numValues(); v++) {
405	valSum += m_data.instance(i).valueSparse(v);
406	}
407	if (valSum <= 0) {
408	if(m_verbose){
409	System.out.format("Instance %s sum of value = %s <= 0, removed.\n", i, valSum);
410	}
411	m_data.delete(i);
412	m_numInstances--;
413	}
414	}
415
416	// get the term-document matrix
417	Input input = new Input();
418	input.Py_x = getTransposedNormedMatrix(m_data);
419	if (m_uniformPrior) {
420	input.Pyx = input.Py_x.copy();
421	normalizePrior(m_data);
422	}
423	else {
424	input.Pyx = getTransposedMatrix(m_data);
425	}
426	input.sumVals = getTotalSum(m_data);
427	input.Pyx.timesEquals((double) 1 / input.sumVals);
428
429	// prior probability of documents, ie. sum the columns from the Pyx matrix
430	input.Px = new double[m_numInstances];
431	for (int i = 0; i < m_numInstances; i++) {
432	for (int j = 0; j < m_numAttributes; j++) {
433	input.Px[i] += input.Pyx.get(j, i);
434	}
435	}
436
437	// prior probability of terms, ie. sum the rows from the Pyx matrix
438	input.Py = new double[m_numAttributes];
439	for (int i = 0; i < input.Pyx.getRowDimension(); i++) {
440	for (int j = 0; j < input.Pyx.getColumnDimension(); j++) {
441	input.Py[i] += input.Pyx.get(i, j);
442	}
443	}
444
445	MI(input.Pyx, input);
446	return input;
447	}
448
449	/**
450	* Initialize the partition
451	* @param input object holding the statistics of the training data
452	* @return the initialized partition
453	*/
454	private Partition sIB_InitT(Input input) {
455	Partition T = new Partition();
456	int avgSize = (int) Math.ceil((double) m_numInstances / m_numCluster);
457
458	ArrayList<Integer> permInstsIdx = new ArrayList<Integer>();
459	ArrayList<Integer> unassigned = new ArrayList<Integer>();
460	for (int i = 0; i < m_numInstances; i++) {
461	unassigned.add(i);
462	}
463	while (unassigned.size() != 0) {
464	int t = random.nextInt(unassigned.size());
465	permInstsIdx.add(unassigned.get(t));
466	unassigned.remove(t);
467	}
468
469	for (int i = 0; i < m_numCluster; i++) {
470	int r2 = avgSize > permInstsIdx.size() ? permInstsIdx.size() : avgSize;
471	for (int j = 0; j < r2; j++) {
472	T.Pt_x[permInstsIdx.get(j)] = i;
473	}
474	for (int j = 0; j < r2; j++) {
475	permInstsIdx.remove(0);
476	}
477	}
478
479	// initialize the prior prob of each cluster, and the probability
480	// for each attribute within the cluster
481	for (int i = 0; i < m_numCluster; i++) {
482	ArrayList<Integer> indices = T.find(i);
483	for (int j = 0; j < indices.size(); j++) {
484	T.Pt[i] += input.Px[indices.get(j)];
485	}
486	double[][] mArray = input.Pyx.getArray();
487	for (int j = 0; j < m_numAttributes; j++) {
488	double sum = 0.0;
489	for (int k = 0; k < indices.size(); k++) {
490	sum += mArray[j][indices.get(k)];
491	}
492	sum /= T.Pt[i];
493	T.Py_t.set(j, i, sum);
494	}
495	}
496
497	if(m_verbose) {
498	System.out.println("Initializing...");
499	}
500	return T;
501	}
502
503	/**
504	* Optimize the partition
505	* @param tmpT partition to be optimized
506	* @param input object describing the statistics of the training dataset
507	* @return the optimized partition
508	*/
509	private Partition sIB_OptimizeT(Partition tmpT, Input input) {
510	boolean done = false;
511	int change = 0, loopCounter = 0;
512	if(m_verbose) {
513	System.out.println("Optimizing...");
514	System.out.println("-------------");
515	}
516	while (!done) {
517	change = 0;
518	for (int i = 0; i < m_numInstances; i++) {
519	int old_t = tmpT.Pt_x[i];
520	// If the current cluster only has one instance left, leave it.
521	if (tmpT.size(old_t) == 1) {
522	if(m_verbose){
523	System.out.format("cluster %s has only 1 doc remain\n", old_t);
524	}
525	continue;
526	}
527	// draw the instance out from its previous cluster
528	reduce_x(i, old_t, tmpT, input);
529
530	// re-cluster the instance
531	int new_t = clusterInstance(i, input, tmpT);
532	if (new_t != old_t) {
533	change++;
534	updateAssignment(i, new_t, tmpT, input.Px[i], input.Py_x);
535	}
536	}
537
538	tmpT.counter += change;
539	if(m_verbose){
540	System.out.format("iteration %s , changes : %s\n", loopCounter, change);
541	}
542	done = checkConvergence(change, loopCounter);
543	loopCounter++;
544	}
545
546	// compute the sIB score
547	tmpT.L = sIB_local_MI(tmpT.Py_t, tmpT.Pt);
548	if(m_verbose){
549	System.out.format("score (L) : %s \n", Utils.doubleToString(tmpT.L, 4));
550	}
551	return tmpT;
552	}
553
554	/**
555	* Draw a instance out from a cluster.
556	* @param instIdx index of the instance to be drawn out
557	* @param t index of the cluster which the instance previously belong to
558	* @param T the current working partition
559	* @param input the input statistics
560	*/
561	private void reduce_x(int instIdx, int t, Partition T, Input input) {
562	// Update the prior probability of the cluster
563	ArrayList<Integer> indices = T.find(t);
564	double sum = 0.0;
565	for (int i = 0; i < indices.size(); i++) {
566	if (indices.get(i) == instIdx)
567	continue;
568	sum += input.Px[indices.get(i)];
569	}
570	T.Pt[t] = sum;
571
572	if (T.Pt[t] < 0) {
573	System.out.format("Warning: probability < 0 (%s)\n", T.Pt[t]);
574	T.Pt[t] = 0;
575	}
576
577	// Update prob of each attribute in the cluster
578	double[][] mArray = input.Pyx.getArray();
579	for (int i = 0; i < m_numAttributes; i++) {
580	sum = 0.0;
581	for (int j = 0; j < indices.size(); j++) {
582	if (indices.get(j) == instIdx)
583	continue;
584	sum += mArray[i][indices.get(j)];
585	}
586	T.Py_t.set(i, t, sum / T.Pt[t]);
587	}
588	}
589
590	/**
591	* Put an instance into a new cluster and update.
592	* @param instIdx instance to be updated
593	* @param newt index of the new cluster this instance has been assigned to
594	* @param T the current working partition
595	* @param Px an array of prior probabilities of the instances
596	*/
597	private void updateAssignment(int instIdx, int newt, Partition T, double Px, Matrix Py_x) {
598	T.Pt_x[instIdx] = newt;
599
600	// update probability of attributes in the cluster
601	double mass = Px + T.Pt[newt];
602	double pi1 = Px / mass;
603	double pi2 = T.Pt[newt] / mass;
604	for (int i = 0; i < m_numAttributes; i++) {
605	T.Py_t.set(i, newt, pi1 * Py_x.get(i, instIdx) + pi2 * T.Py_t.get(i, newt));
606	}
607
608	T.Pt[newt] = mass;
609	}
610
611	/**
612	* Check whether the current iteration is converged
613	* @param change number of changes in current iteration
614	* @param loops number of iterations done
615	* @return true if the iteration is converged, false otherwise
616	*/
617	private boolean checkConvergence(int change, int loops) {
618	if (change <= m_minChange \|\| loops >= m_maxLoop) {
619	if(m_verbose){
620	System.out.format("\nsIB converged after %s iterations with %s changes\n", loops,
621	change);
622	}
623	return true;
624	}
625	return false;
626	}
627
628	/**
629	* Cluster an instance into the nearest cluster.
630	* @param instIdx Index of the instance to be clustered
631	* @param input Object which describe the statistics of the training dataset
632	* @param T Partition
633	* @return index of the cluster that has the minimum distance to the instance
634	*/
635	private int clusterInstance(int instIdx, Input input, Partition T) {
636	double[] distances = new double[m_numCluster];
637	for (int i = 0; i < m_numCluster; i++) {
638	double Pnew = input.Px[instIdx] + T.Pt[i];
639	double pi1 = input.Px[instIdx] / Pnew;
640	double pi2 = T.Pt[i] / Pnew;
641	distances[i] = Pnew * JS(instIdx, input, T, i, pi1, pi2);
642	}
643	return Utils.minIndex(distances);
644	}
645
646	/**
647	* Compute the JS divergence between an instance and a cluster, used for training data
648	* @param instIdx index of the instance
649	* @param input statistics of the input data
650	* @param T the whole partition
651	* @param t index of the cluster
652	* @param pi1
653	* @param pi2
654	* @return the JS divergence
655	*/
656	private double JS(int instIdx, Input input, Partition T, int t, double pi1, double pi2) {
657	if (Math.min(pi1, pi2) <= 0) {
658	System.out.format("Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n", pi1, pi2);
659	return 0;
660	}
661	Instance inst = m_data.instance(instIdx);
662	double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
663	for (int i = 0; i < inst.numValues(); i++) {
664	tmp = input.Py_x.get(inst.index(i), instIdx);
665	if(tmp != 0) {
666	kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * T.Py_t.get(inst.index(i), t)));
667	}
668	}
669	for (int i = 0; i < m_numAttributes; i++) {
670	if ((tmp = T.Py_t.get(i, t)) != 0) {
671	kl2 += tmp * Math.log(tmp / (input.Py_x.get(i, instIdx) * pi1 + pi2 * tmp));
672	}
673	}
674	return pi1 * kl1 + pi2 * kl2;
675	}
676
677	/**
678	* Compute the JS divergence between an instance and a cluster, used for test data
679	* @param inst instance to be clustered
680	* @param t index of the cluster
681	* @param pi1
682	* @param pi2
683	* @return the JS divergence
684	*/
685	private double JS(Instance inst, int t, double pi1, double pi2) {
686	if (Math.min(pi1, pi2) <= 0) {
687	System.out.format("Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n", pi1, pi2);
688	return 0;
689	}
690	double sum = Utils.sum(inst.toDoubleArray());
691	double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
692	for (int i = 0; i < inst.numValues(); i++) {
693	tmp = inst.valueSparse(i) / sum;
694	if(tmp != 0) {
695	kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * bestT.Py_t.get(inst.index(i), t)));
696	}
697	}
698	for (int i = 0; i < m_numAttributes; i++) {
699	if ((tmp = bestT.Py_t.get(i, t)) != 0) {
700	kl2 += tmp * Math.log(tmp / (inst.value(i) * pi1 / sum + pi2 * tmp));
701	}
702	}
703	return pi1 * kl1 + pi2 * kl2;
704	}
705
706	/**
707	* Compute the sIB score
708	* @param m a term-cluster matrix, with m[i, j] is the probability of term i given cluster j
709	* @param Pt an array of cluster prior probabilities
710	* @return the sIB score which indicates the quality of the partition
711	*/
712	private double sIB_local_MI(Matrix m, double[] Pt) {
713	double Hy = 0.0, Ht = 0.0;
714	for (int i = 0; i < Pt.length; i++) {
715	Ht += Pt[i] * Math.log(Pt[i]);
716	}
717	Ht = -Ht;
718
719	for (int i = 0; i < m_numAttributes; i++) {
720	double Py = 0.0;
721	for (int j = 0; j < m_numCluster; j++) {
722	Py += m.get(i, j) * Pt[j];
723	}
724	if(Py == 0) continue;
725	Hy += Py * Math.log(Py);
726	}
727	Hy = -Hy;
728
729	double Hyt = 0.0, tmp = 0.0;
730	for (int i = 0; i < m.getRowDimension(); i++) {
731	for (int j = 0; j < m.getColumnDimension(); j++) {
732	if ((tmp = m.get(i, j)) == 0 \|\| Pt[j] == 0) {
733	continue;
734	}
735	tmp *= Pt[j];
736	Hyt += tmp * Math.log(tmp);
737	}
738	}
739	return Hy + Ht + Hyt;
740	}
741
742	/**
743	* Get the sum of value of the dataset
744	* @param data set of instances to handle
745	* @return sum of all the attribute values for all the instances in the dataset
746	*/
747	private double getTotalSum(Instances data) {
748	double sum = 0.0;
749	for (int i = 0; i < data.numInstances(); i++) {
750	for (int v = 0; v < data.instance(i).numValues(); v++) {
751	sum += data.instance(i).valueSparse(v);
752	}
753	}
754	return sum;
755	}
756
757	/**
758	* Transpose the document-term matrix to term-document matrix
759	* @param data instances with document-term info
760	* @return a term-document matrix transposed from the input dataset
761	*/
762	private Matrix getTransposedMatrix(Instances data) {
763	double[][] temp = new double[data.numAttributes()][data.numInstances()];
764	for (int i = 0; i < data.numInstances(); i++) {
765	Instance inst = data.instance(i);
766	for (int v = 0; v < inst.numValues(); v++) {
767	temp[inst.index(v)][i] = inst.valueSparse(v);
768	}
769	}
770	Matrix My_x = new Matrix(temp);
771	return My_x;
772	}
773
774	/**
775	* Normalize the document vectors
776	* @param data instances to be normalized
777	*/
778	private void normalizePrior(Instances data) {
779	for (int i = 0; i < data.numInstances(); i++) {
780	normalizeInstance(data.instance(i));
781	}
782	}
783
784	/**
785	* Normalize the instance
786	* @param inst instance to be normalized
787	* @return a new Instance with normalized values
788	*/
789	private Instance normalizeInstance(Instance inst) {
790	double[] vals = inst.toDoubleArray();
791	double sum = Utils.sum(vals);
792	for(int i = 0; i < vals.length; i++) {
793	vals[i] /= sum;
794	}
795	return new DenseInstance(inst.weight(), vals);
796	}
797
798	private Matrix getTransposedNormedMatrix(Instances data) {
799	Matrix matrix = new Matrix(data.numAttributes(), data.numInstances());
800	for(int i = 0; i < data.numInstances(); i++){
801	double[] vals = data.instance(i).toDoubleArray();
802	double sum = Utils.sum(vals);
803	for (int v = 0; v < vals.length; v++) {
804	vals[v] /= sum;
805	matrix.set(v, i, vals[v]);
806	}
807	}
808	return matrix;
809	}
810
811	/**
812	* Compute the MI between instances and attributes
813	* @param m the term-document matrix
814	* @param input object that describes the statistics about the training data
815	*/
816	private void MI(Matrix m, Input input){
817	int minDimSize = m.getColumnDimension() < m.getRowDimension() ? m.getColumnDimension() : m.getRowDimension();
818	if(minDimSize < 2){
819	System.err.println("Warning : This is not a JOINT distribution");
820	input.Hx = Entropy (m);
821	input.Hy = 0;
822	input.Ixy = 0;
823	return;
824	}
825
826	input.Hx = Entropy(input.Px);
827	input.Hy = Entropy(input.Py);
828
829	double entropy = input.Hx + input.Hy;
830	for (int i=0; i < m_numInstances; i++) {
831	Instance inst = m_data.instance(i);
832	for (int v = 0; v < inst.numValues(); v++) {
833	double tmp = m.get(inst.index(v), i);
834	if(tmp <= 0) continue;
835	entropy += tmp * Math.log(tmp);
836	}
837	}
838	input.Ixy = entropy;
839	if(m_verbose) {
840	System.out.println("Ixy = " + input.Ixy);
841	}
842	}
843
844	/**
845	* Compute the entropy score based on an array of probabilities
846	* @param probs array of non-negative and normalized probabilities
847	* @return the entropy value
848	*/
849	private double Entropy(double[] probs){
850	for (int i = 0; i < probs.length; i++){
851	if (probs[i] <= 0) {
852	if(m_verbose) {
853	System.out.println("Warning: Negative probability.");
854	}
855	return Double.NaN;
856	}
857	}
858	// could be unormalized, when normalization is not specified
859	if(Math.abs(Utils.sum(probs)-1) >= 1e-6) {
860	if(m_verbose) {
861	System.out.println("Warning: Not normalized.");
862	}
863	return Double.NaN;
864	}
865
866	double mi = 0.0;
867	for (int i = 0; i < probs.length; i++) {
868	mi += probs[i] * Math.log(probs[i]);
869	}
870	mi = -mi;
871	return mi;
872	}
873
874	/**
875	* Compute the entropy score based on a matrix
876	* @param p a matrix with non-negative and normalized probabilities
877	* @return the entropy value
878	*/
879	private double Entropy(Matrix p) {
880	double mi = 0;
881	for (int i = 0; i < p.getRowDimension(); i++) {
882	for (int j = 0; j < p.getColumnDimension(); j++) {
883	if(p.get(i, j) == 0){
884	continue;
885	}
886	mi += p.get(i, j) + Math.log(p.get(i, j));
887	}
888	}
889	mi = -mi;
890	return mi;
891	}
892
893	/**
894	* Parses a given list of options. <p/>
895	*
896	<!-- options-start -->
897	* Valid options are: <p/>
898	*
899	* <pre> -I <num>
900	* maximum number of iterations
901	* (default 100).</pre>
902	*
903	* <pre> -M <num>
904	* minimum number of changes in a single iteration
905	* (default 0).</pre>
906	*
907	* <pre> -N <num>
908	* number of clusters.
909	* (default 2).</pre>
910	*
911	* <pre> -R <num>
912	* number of restarts.
913	* (default 5).</pre>
914	*
915	* <pre> -U
916	* set not to normalize the data
917	* (default true).</pre>
918	*
919	* <pre> -V
920	* set to output debug info
921	* (default false).</pre>
922	*
923	* <pre> -S <num>
924	* Random number seed.
925	* (default 1)</pre>
926	*
927	<!-- options-end -->
928	*
929	* @param options the list of options as an array of strings
930	* @throws Exception if an option is not supported
931	*/
932	public void setOptions(String[] options) throws Exception {
933	String optionString = Utils.getOption('I', options);
934	if (optionString.length() != 0) {
935	setMaxIterations(Integer.parseInt(optionString));
936	}
937	optionString = Utils.getOption('M', options);
938	if (optionString.length() != 0) {
939	setMinChange((new Integer(optionString)).intValue());
940	}
941	optionString = Utils.getOption('N', options);
942	if (optionString.length() != 0) {
943	setNumClusters(Integer.parseInt(optionString));
944	}
945	optionString = Utils.getOption('R', options);
946	if (optionString.length() != 0) {
947	setNumRestarts((new Integer(optionString)).intValue());
948	}
949	setNotUnifyNorm(Utils.getFlag('U', options));
950	setDebug(Utils.getFlag('V', options));
951
952	super.setOptions(options);
953	}
954
955	/**
956	* Returns an enumeration describing the available options.
957	* @return an enumeration of all the available options.
958	*/
959	public Enumeration listOptions() {
960	Vector<Option> result = new Vector<Option>();
961	result.addElement(new Option("\tmaximum number of iterations\n"
962	+ "\t(default 100).", "I", 1, "-I <num>"));
963	result.addElement(new Option(
964	"\tminimum number of changes in a single iteration\n"
965	+ "\t(default 0).", "M", 1, "-M <num>"));
966	result.addElement(new Option("\tnumber of clusters.\n" + "\t(default 2).",
967	"N", 1, "-N <num>"));
968	result.addElement(new Option("\tnumber of restarts.\n"
969	+ "\t(default 5).", "R", 1, "-R <num>"));
970	result.addElement(new Option("\tset not to normalize the data\n"
971	+ "\t(default true).", "U", 0, "-U"));
972	result.addElement(new Option("\tset to output debug info\n"
973	+ "\t(default false).", "V", 0, "-V"));
974
975	Enumeration en = super.listOptions();
976	while (en.hasMoreElements())
977	result.addElement((Option) en.nextElement());
978
979	return result.elements();
980	}
981
982	/**
983	* Gets the current settings.
984	* @return an array of strings suitable for passing to setOptions()
985	*/
986	public String[] getOptions() {
987	Vector<String> result;
988	result = new Vector<String>();
989	result.add("-I");
990	result.add("" + getMaxIterations());
991	result.add("-M");
992	result.add("" + getMinChange());
993	result.add("-N");
994	result.add("" + getNumClusters());
995	result.add("-R");
996	result.add("" + getNumRestarts());
997	if(getNotUnifyNorm()) {
998	result.add("-U");
999	}
1000	if(getDebug()) {
1001	result.add("-V");
1002	}
1003
1004	String[] options = super.getOptions();
1005	for (int i = 0; i < options.length; i++){
1006	result.add(options[i]);
1007	}
1008	return result.toArray(new String[result.size()]);
1009	}
1010
1011	/**
1012	* Returns the tip text for this property
1013	* @return tip text for this property suitable for
1014	* displaying in the explorer/experimenter gui
1015	*/
1016	public String debugTipText() {
1017	return "If set to true, clusterer may output additional info to " +
1018	"the console.";
1019	}
1020
1021	/**
1022	* Set debug mode - verbose output
1023	* @param v true for verbose output
1024	*/
1025	public void setDebug (boolean v) {
1026	m_verbose = v;
1027	}
1028
1029	/**
1030	* Get debug mode
1031	* @return true if debug mode is set
1032	*/
1033	public boolean getDebug () {
1034	return m_verbose;
1035	}
1036
1037	/**
1038	* Returns the tip text for this property.
1039	* @return tip text for this property
1040	*/
1041	public String maxIterationsTipText() {
1042	return "set maximum number of iterations (default 100)";
1043	}
1044
1045	/**
1046	* Set the max number of iterations
1047	* @param i max number of iterations
1048	*/
1049	public void setMaxIterations(int i) {
1050	m_maxLoop = i;
1051	}
1052
1053	/**
1054	* Get the max number of iterations
1055	* @return max number of iterations
1056	*/
1057	public int getMaxIterations() {
1058	return m_maxLoop;
1059	}
1060
1061	/**
1062	* Returns the tip text for this property.
1063	* @return tip text for this property
1064	*/
1065	public String minChangeTipText() {
1066	return "set minimum number of changes (default 0)";
1067	}
1068
1069	/**
1070	* set the minimum number of changes
1071	* @param m the minimum number of changes
1072	*/
1073	public void setMinChange(int m) {
1074	m_minChange = m;
1075	}
1076
1077	/**
1078	* get the minimum number of changes
1079	* @return the minimum number of changes
1080	*/
1081	public int getMinChange() {
1082	return m_minChange;
1083	}
1084
1085	/**
1086	* Returns the tip text for this property.
1087	* @return tip text for this property
1088	*/
1089	public String numClustersTipText() {
1090	return "set number of clusters (default 2)";
1091	}
1092
1093	/**
1094	* Set the number of clusters
1095	* @param n number of clusters
1096	*/
1097	public void setNumClusters(int n) {
1098	m_numCluster = n;
1099	}
1100
1101	/**
1102	* Get the number of clusters
1103	* @return the number of clusters
1104	*/
1105	public int getNumClusters() {
1106	return m_numCluster;
1107	}
1108
1109	/**
1110	* Get the number of clusters
1111	* @return the number of clusters
1112	*/
1113	public int numberOfClusters() {
1114	return m_numCluster;
1115	}
1116
1117	/**
1118	* Returns the tip text for this property.
1119	* @return tip text for this property
1120	*/
1121	public String numRestartsTipText() {
1122	return "set number of restarts (default 5)";
1123	}
1124
1125	/**
1126	* Set the number of restarts
1127	* @param i number of restarts
1128	*/
1129	public void setNumRestarts(int i) {
1130	m_numRestarts = i;
1131	}
1132
1133	/**
1134	* Get the number of restarts
1135	* @return number of restarts
1136	*/
1137	public int getNumRestarts(){
1138	return m_numRestarts;
1139	}
1140
1141	/**
1142	* Returns the tip text for this property.
1143	* @return tip text for this property
1144	*/
1145	public String notUnifyNormTipText() {
1146	return "set whether to normalize each instance to a unify prior probability (eg. 1).";
1147	}
1148
1149	/**
1150	* Set whether to normalize instances to unify prior probability
1151	* before building the clusterer
1152	* @param b true to normalize, otherwise false
1153	*/
1154	public void setNotUnifyNorm(boolean b){
1155	m_uniformPrior = !b;
1156	}
1157
1158	/**
1159	* Get whether to normalize instances to unify prior probability
1160	* before building the clusterer
1161	* @return true if set to normalize, false otherwise
1162	*/
1163	public boolean getNotUnifyNorm() {
1164	return !m_uniformPrior;
1165	}
1166
1167	/**
1168	* Returns a string describing this clusterer
1169	* @return a description of the clusterer suitable for
1170	* displaying in the explorer/experimenter gui
1171	*/
1172	public String globalInfo() {
1173	return "Cluster data using the sequential information bottleneck algorithm.\n\n" +
1174	"Note: only hard clustering scheme is supported. sIB assign for each " +
1175	"instance the cluster that have the minimum cost/distance to the instance. " +
1176	"The trade-off beta is set to infinite so 1/beta is zero.\n\n" +
1177	"For more information, see:\n\n"
1178	+getTechnicalInformation().toString();
1179	}
1180
1181	/**
1182	* Returns an instance of a TechnicalInformation object, containing
1183	* detailed information about the technical background of this class,
1184	* e.g., paper reference or book this class is based on.
1185	* @return the technical information about this class
1186	*/
1187	public TechnicalInformation getTechnicalInformation() {
1188	TechnicalInformation result;
1189
1190	result = new TechnicalInformation(Type.INPROCEEDINGS);
1191	result.setValue(Field.AUTHOR, "Noam Slonim and Nir Friedman and Naftali Tishby");
1192	result.setValue(Field.YEAR, "2002");
1193	result.setValue(Field.TITLE, "Unsupervised document classification using sequential information maximization");
1194	result.setValue(Field.BOOKTITLE, "Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval");
1195	result.setValue(Field.PAGES, "129-136");
1196
1197	return result;
1198	}
1199
1200	/**
1201	* Returns default capabilities of the clusterer.
1202	* @return the capabilities of this clusterer
1203	*/
1204	public Capabilities getCapabilities() {
1205	Capabilities result = super.getCapabilities();
1206	result.disableAll();
1207	result.enable(Capability.NO_CLASS);
1208
1209	// attributes
1210	result.enable(Capability.NUMERIC_ATTRIBUTES);
1211	return result;
1212	}
1213
1214	public String toString(){
1215	StringBuffer text = new StringBuffer();
1216	text.append("\nsIB\n===\n");
1217	text.append("\nNumber of clusters: " + m_numCluster + "\n");
1218
1219	for (int j = 0; j < m_numCluster; j++) {
1220	text.append("\nCluster: " + j + " Size : " + bestT.size(j) + " Prior probability: "
1221	+ Utils.doubleToString(bestT.Pt[j], 4) + "\n\n");
1222	for (int i = 0; i < m_numAttributes; i++) {
1223	text.append("Attribute: " + m_data.attribute(i).name() + "\n");
1224	text.append("Probability given the cluster = "
1225	+ Utils.doubleToString(bestT.Py_t.get(i, j), 4)
1226	+ "\n");
1227	}
1228	}
1229	return text.toString();
1230	}
1231
1232	/**
1233	* Returns the revision string.
1234	*
1235	* @return the revision
1236	*/
1237	public String getRevision() {
1238	return RevisionUtils.extract("$Revision: 5987 $");
1239	}
1240
1241	public static void main(String[] argv) {
1242	runClusterer(new sIB(), argv);
1243	}
1244	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: