/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * sIB.java
 * Copyright (C) 2008 University of Waikato, Hamilton, New Zealand
 */

package weka.clusterers;

import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.matrix.Matrix;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 * Cluster data using the sequential information bottleneck algorithm.
 *
 * Note: only the hard clustering scheme is supported. sIB assigns each
 * instance to the cluster that has the minimum cost/distance to the instance.
 * The trade-off parameter beta is set to infinity, so 1/beta is zero.
 *
 * For more information, see:
 *
 * Noam Slonim, Nir Friedman, Naftali Tishby: Unsupervised document
 * classification using sequential information maximization. In: Proceedings
 * of the 25th International ACM SIGIR Conference on Research and Development
 * in Information Retrieval, 129-136, 2002.
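 *
 * Concretely, the merge cost used when (re)assigning an instance x to a
 * cluster t (as implemented in clusterInstance and JS below) is
 * d(x, t) = (p(x) + p(t)) * JS(p(y|x), p(y|t)), where JS is the
 * Jensen-Shannon divergence taken with weights p(x)/(p(x)+p(t)) and
 * p(t)/(p(x)+p(t)); each instance is assigned to the cluster with the
 * smallest d(x, t).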
 *
 * BibTeX:
 * <pre>
 *    &#64;inproceedings{Slonim2002,
 *    author = {Noam Slonim and Nir Friedman and Naftali Tishby},
 *    booktitle = {Proceedings of the 25th International ACM SIGIR Conference on Research and Development in Information Retrieval},
 *    pages = {129-136},
 *    title = {Unsupervised document classification using sequential information maximization},
 *    year = {2002}
 * }
 * </pre>
 *
 * Valid options are: <p/>
 *
 * <pre> -I &lt;num&gt;
 *  maximum number of iterations
 *  (default 100).</pre>
 *
 * <pre> -M &lt;num&gt;
 *  minimum number of changes in a single iteration
 *  (default 0).</pre>
 *
 * <pre> -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).</pre>
 *
 * <pre> -R &lt;num&gt;
 *  number of restarts.
 *  (default 5).</pre>
 *
 * <pre> -U
 *  set not to normalize the data
 *  (default true).</pre>
 *
 * <pre> -V
 *  set to output debug info
 *  (default false).</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)</pre>
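 *
 * Example API usage (a minimal sketch; the file name "data.arff" and the
 * parameter values are illustrative, not part of this class):
 * <pre>
 * import weka.core.Instances;
 * import weka.core.converters.ConverterUtils.DataSource;
 *
 * Instances data = DataSource.read("data.arff");
 * sIB clusterer = new sIB();
 * clusterer.setNumClusters(3);  // -N
 * clusterer.setNumRestarts(10); // -R
 * clusterer.buildClusterer(data);
 * int cluster = clusterer.clusterInstance(data.instance(0));
 * </pre>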
 *
 * @author Noam Slonim
 * @author Anna Huang
 * @version $Revision: 5987 $
 */
public class sIB
  extends RandomizableClusterer
  implements TechnicalInformationHandler {

  /** for serialization. */
  private static final long serialVersionUID = -8652125897352654213L;

  /**
   * Inner class handling status of the input data
   *
   * @see Serializable
   */
  private class Input
    implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = -2464453171263384037L;

    /** Prior probability of each instance */
    private double[] Px;

    /** Prior probability of each attribute */
    private double[] Py;

    /** Joint distribution of attribute and instance */
    private Matrix Pyx;

    /** P[y|x] */
    private Matrix Py_x;

    /** Mutual information between the instances and the attributes */
    private double Ixy;

    /** Entropy of the attributes */
    private double Hy;

    /** Entropy of the instances */
    private double Hx;

    /** Sum of all values in the dataset */
    private double sumVals;

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 5987 $");
    }
  }

  /**
   * Internal class handling the whole partition
   *
   * @see Serializable
   */
  private class Partition
    implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = 4957194978951259946L;

    /** Cluster assignment for each instance */
    private int[] Pt_x;

    /** Prior probability of each cluster */
    private double[] Pt;

    /** sIB equation score, to evaluate the quality of the partition */
    private double L;

    /** Number of changes during the generation of this partition */
    private int counter;

    /** Attribute probabilities for each cluster */
    private Matrix Py_t;

    /**
     * Create a new empty Partition instance.
     */
    public Partition() {
      Pt_x = new int[m_numInstances];
      for (int i = 0; i < m_numInstances; i++) {
        Pt_x[i] = -1;
      }
      Pt = new double[m_numCluster];
      Py_t = new Matrix(m_numAttributes, m_numCluster);
      counter = 0;
    }

    /**
     * Find all the instances that have been assigned to cluster i
     *
     * @param i index of the cluster
     * @return an arraylist of the instance ids that have been assigned to cluster i
     */
    private ArrayList<Integer> find(int i) {
      ArrayList<Integer> indices = new ArrayList<Integer>();
      for (int x = 0; x < Pt_x.length; x++) {
        if (Pt_x[x] == i) {
          indices.add(x);
        }
      }
      return indices;
    }

    /**
     * Find the size of the cluster i
     *
     * @param i index of the cluster
     * @return the size of cluster i
     */
    private int size(int i) {
      int count = 0;
      for (int x = 0; x < Pt_x.length; x++) {
        if (Pt_x[x] == i) {
          count++;
        }
      }
      return count;
    }

    /**
     * Copy the current partition into T
     *
     * @param T the target partition object
     */
    private void copy(Partition T) {
      if (T == null) {
        T = new Partition();
      }
      System.arraycopy(Pt_x, 0, T.Pt_x, 0, Pt_x.length);
      System.arraycopy(Pt, 0, T.Pt, 0, Pt.length);
      T.L = L;
      T.counter = counter;

      double[][] mArray = Py_t.getArray();
      double[][] tgtArray = T.Py_t.getArray();
      for (int i = 0; i < mArray.length; i++) {
        System.arraycopy(mArray[i], 0, tgtArray[i], 0, mArray[0].length);
      }
    }

    /**
     * Output the current partition
     *
     * @return a string that describes the partition
     */
    public String toString() {
      StringBuffer text = new StringBuffer();
      text.append("score (L) : " + Utils.doubleToString(L, 4) + "\n");
      text.append("number of changes : " + counter + "\n");
      for (int i = 0; i < m_numCluster; i++) {
        text.append("\nCluster " + i + "\n");
        text.append("size : " + size(i) + "\n");
        text.append("prior prob : " + Utils.doubleToString(Pt[i], 4) + "\n");
      }
      return text.toString();
    }
    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 5987 $");
    }
  }

  /** Training data */
  private Instances m_data;

  /** Number of clusters */
  private int m_numCluster = 2;

  /** Number of restarts */
  private int m_numRestarts = 5;

  /** Verbose? */
  private boolean m_verbose = false;

  /** Uniform prior probability of the documents */
  private boolean m_uniformPrior = true;

  /** Max number of iterations during each restart */
  private int m_maxLoop = 100;

  /** Minimum number of changes */
  private int m_minChange = 0;

  /** Globally replace missing values */
  private ReplaceMissingValues m_replaceMissing;

  /** Number of instances */
  private int m_numInstances;

  /** Number of attributes */
  private int m_numAttributes;

  /** Randomly generate initial partition */
  private Random random;

  /** Holds the best partition built */
  private Partition bestT;

  /** Holds the statistics about the input dataset */
  private Input input;

  /**
   * Generates a clusterer.
   *
   * @param data the training instances
   * @throws Exception if something goes wrong
   */
  public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_replaceMissing = new ReplaceMissingValues();
    Instances instances = new Instances(data);
    instances.setClassIndex(-1);
    m_replaceMissing.setInputFormat(instances);
    data = weka.filters.Filter.useFilter(instances, m_replaceMissing);
    instances = null;

    // initialize all fields that are not being set via options
    m_data = data;
    m_numInstances = m_data.numInstances();
    m_numAttributes = m_data.numAttributes();
    random = new Random(getSeed());

    // initialize the statistics of the input training data
    input = sIB_ProcessInput();

    // object to hold the best partition
    bestT = new Partition();

    // the real clustering
    double bestL = Double.NEGATIVE_INFINITY;
    for (int k = 0; k < m_numRestarts; k++) {
      if (m_verbose) {
        System.out.format("restart number %s...\n", k);
      }

      // initialize the partition and optimize it
      Partition tmpT = sIB_InitT(input);
      tmpT = sIB_OptimizeT(tmpT, input);

      // if a better partition is found, save it
      if (tmpT.L > bestL) {
        tmpT.copy(bestT);
        bestL = bestT.L;
      }

      if (m_verbose) {
        System.out.println("\nPartition status : ");
        System.out.println("------------------");
        System.out.println(tmpT.toString() + "\n");
      }
    }

    if (m_verbose) {
      System.out.println("\nBest Partition");
      System.out.println("===============");
      System.out.println(bestT.toString());
    }

    // save memory
    m_data = new Instances(m_data, 0);
  }

  /**
   * Cluster a given instance. This is the method defined in the Clusterer
   * interface; it simply returns the index of the cluster assigned to the
   * instance.
   *
   * @param instance the instance to be clustered
   * @return the index of the assigned cluster
   * @throws Exception if something goes wrong
   */
  public int clusterInstance(Instance instance) throws Exception {
    double prior = (double) 1 / input.sumVals;
    double[] distances = new double[m_numCluster];
    for (int i = 0; i < m_numCluster; i++) {
      double Pnew = bestT.Pt[i] + prior;
      double pi1 = prior / Pnew;
      double pi2 = bestT.Pt[i] / Pnew;
      distances[i] = Pnew * JS(instance, i, pi1, pi2);
    }
    return Utils.minIndex(distances);
  }
  /**
   * Process the input and compute the statistics of the training data
   *
   * @return an Input object which holds the statistics about the training data
   */
  private Input sIB_ProcessInput() {
    double valSum = 0.0;
    for (int i = 0; i < m_numInstances; i++) {
      valSum = 0.0;
      for (int v = 0; v < m_data.instance(i).numValues(); v++) {
        valSum += m_data.instance(i).valueSparse(v);
      }
      if (valSum <= 0) {
        if (m_verbose) {
          System.out.format("Instance %s sum of value = %s <= 0, removed.\n", i, valSum);
        }
        m_data.delete(i);
        m_numInstances--;
        i--; // stay at this index: the following instances have shifted down
      }
    }

    // get the term-document matrix
    Input input = new Input();
    input.Py_x = getTransposedNormedMatrix(m_data);
    if (m_uniformPrior) {
      input.Pyx = input.Py_x.copy();
      normalizePrior(m_data);
    } else {
      input.Pyx = getTransposedMatrix(m_data);
    }
    input.sumVals = getTotalSum(m_data);
    input.Pyx.timesEquals((double) 1 / input.sumVals);

    // prior probability of documents, i.e. sum the columns of the Pyx matrix
    input.Px = new double[m_numInstances];
    for (int i = 0; i < m_numInstances; i++) {
      for (int j = 0; j < m_numAttributes; j++) {
        input.Px[i] += input.Pyx.get(j, i);
      }
    }

    // prior probability of terms, i.e. sum the rows of the Pyx matrix
    input.Py = new double[m_numAttributes];
    for (int i = 0; i < input.Pyx.getRowDimension(); i++) {
      for (int j = 0; j < input.Pyx.getColumnDimension(); j++) {
        input.Py[i] += input.Pyx.get(i, j);
      }
    }

    MI(input.Pyx, input);
    return input;
  }

  /**
   * Initialize the partition
   *
   * @param input object holding the statistics of the training data
   * @return the initialized partition
   */
  private Partition sIB_InitT(Input input) {
    Partition T = new Partition();
    int avgSize = (int) Math.ceil((double) m_numInstances / m_numCluster);

    ArrayList<Integer> permInstsIdx = new ArrayList<Integer>();
    ArrayList<Integer> unassigned = new ArrayList<Integer>();
    for (int i = 0; i < m_numInstances; i++) {
      unassigned.add(i);
    }
    while (unassigned.size() != 0) {
      int t = random.nextInt(unassigned.size());
      permInstsIdx.add(unassigned.get(t));
      unassigned.remove(t);
    }
    for (int i = 0; i < m_numCluster; i++) {
      int r2 = avgSize > permInstsIdx.size() ? permInstsIdx.size() : avgSize;
      for (int j = 0; j < r2; j++) {
        T.Pt_x[permInstsIdx.get(j)] = i;
      }
      for (int j = 0; j < r2; j++) {
        permInstsIdx.remove(0);
      }
    }

    // initialize the prior prob of each cluster, and the probability
    // for each attribute within the cluster
    for (int i = 0; i < m_numCluster; i++) {
      ArrayList<Integer> indices = T.find(i);
      for (int j = 0; j < indices.size(); j++) {
        T.Pt[i] += input.Px[indices.get(j)];
      }

      double[][] mArray = input.Pyx.getArray();
      for (int j = 0; j < m_numAttributes; j++) {
        double sum = 0.0;
        for (int k = 0; k < indices.size(); k++) {
          sum += mArray[j][indices.get(k)];
        }
        sum /= T.Pt[i];
        T.Py_t.set(j, i, sum);
      }
    }

    if (m_verbose) {
      System.out.println("Initializing...");
    }
    return T;
  }

  /**
   * Optimize the partition
   *
   * @param tmpT partition to be optimized
   * @param input object describing the statistics of the training dataset
   * @return the optimized partition
   */
  private Partition sIB_OptimizeT(Partition tmpT, Input input) {
    boolean done = false;
    int change = 0, loopCounter = 0;
    if (m_verbose) {
      System.out.println("Optimizing...");
      System.out.println("-------------");
    }
    while (!done) {
      change = 0;
      for (int i = 0; i < m_numInstances; i++) {
        int old_t = tmpT.Pt_x[i];
        // If the current cluster only has one instance left, leave it.
        if (tmpT.size(old_t) == 1) {
          if (m_verbose) {
            System.out.format("cluster %s has only 1 doc remaining\n", old_t);
          }
          continue;
        }
        // draw the instance out from its previous cluster
        reduce_x(i, old_t, tmpT, input);

        // re-cluster the instance
        int new_t = clusterInstance(i, input, tmpT);
        if (new_t != old_t) {
          change++;
          updateAssignment(i, new_t, tmpT, input.Px[i], input.Py_x);
        }
      }
      tmpT.counter += change;
      if (m_verbose) {
        System.out.format("iteration %s , changes : %s\n", loopCounter, change);
      }
      done = checkConvergence(change, loopCounter);
      loopCounter++;
    }

    // compute the sIB score
    tmpT.L = sIB_local_MI(tmpT.Py_t, tmpT.Pt);
    if (m_verbose) {
      System.out.format("score (L) : %s \n", Utils.doubleToString(tmpT.L, 4));
    }
    return tmpT;
  }
  /**
   * Draw an instance out from a cluster.
   *
   * @param instIdx index of the instance to be drawn out
   * @param t index of the cluster which the instance previously belonged to
   * @param T the current working partition
   * @param input the input statistics
   */
  private void reduce_x(int instIdx, int t, Partition T, Input input) {
    // Update the prior probability of the cluster
    ArrayList<Integer> indices = T.find(t);
    double sum = 0.0;
    for (int i = 0; i < indices.size(); i++) {
      if (indices.get(i) == instIdx)
        continue;
      sum += input.Px[indices.get(i)];
    }
    T.Pt[t] = sum;

    if (T.Pt[t] < 0) {
      System.out.format("Warning: probability < 0 (%s)\n", T.Pt[t]);
      T.Pt[t] = 0;
    }

    // Update prob of each attribute in the cluster
    double[][] mArray = input.Pyx.getArray();
    for (int i = 0; i < m_numAttributes; i++) {
      sum = 0.0;
      for (int j = 0; j < indices.size(); j++) {
        if (indices.get(j) == instIdx)
          continue;
        sum += mArray[i][indices.get(j)];
      }
      T.Py_t.set(i, t, sum / T.Pt[t]);
    }
  }

  /**
   * Put an instance into a new cluster and update.
   *
   * @param instIdx instance to be updated
   * @param newt index of the new cluster this instance has been assigned to
   * @param T the current working partition
   * @param Px prior probability of the instance
   * @param Py_x conditional probabilities of the attributes given the instances
   */
  private void updateAssignment(int instIdx, int newt, Partition T, double Px, Matrix Py_x) {
    T.Pt_x[instIdx] = newt;

    // update probability of attributes in the cluster
    double mass = Px + T.Pt[newt];
    double pi1 = Px / mass;
    double pi2 = T.Pt[newt] / mass;
    for (int i = 0; i < m_numAttributes; i++) {
      T.Py_t.set(i, newt, pi1 * Py_x.get(i, instIdx) + pi2 * T.Py_t.get(i, newt));
    }
    T.Pt[newt] = mass;
  }

  /**
   * Check whether the current iteration has converged
   *
   * @param change number of changes in the current iteration
   * @param loops number of iterations done
   * @return true if the iteration has converged, false otherwise
   */
  private boolean checkConvergence(int change, int loops) {
    if (change <= m_minChange || loops >= m_maxLoop) {
      if (m_verbose) {
        System.out.format("\nsIB converged after %s iterations with %s changes\n",
                          loops, change);
      }
      return true;
    }
    return false;
  }

  /**
   * Cluster an instance into the nearest cluster.
   *
   * @param instIdx index of the instance to be clustered
   * @param input object which describes the statistics of the training dataset
   * @param T the current working partition
   * @return index of the cluster that has the minimum distance to the instance
   */
  private int clusterInstance(int instIdx, Input input, Partition T) {
    double[] distances = new double[m_numCluster];
    for (int i = 0; i < m_numCluster; i++) {
      double Pnew = input.Px[instIdx] + T.Pt[i];
      double pi1 = input.Px[instIdx] / Pnew;
      double pi2 = T.Pt[i] / Pnew;
      distances[i] = Pnew * JS(instIdx, input, T, i, pi1, pi2);
    }
    return Utils.minIndex(distances);
  }
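  /*
   * A worked example of the weighted JS divergence computed by the two JS
   * methods below (illustrative numbers, not taken from the code): for
   * p = (0.5, 0.5), q = (1.0, 0.0) and weights pi1 = pi2 = 0.5, the mixture
   * is m = pi1*p + pi2*q = (0.75, 0.25), giving
   * JS = 0.5*KL(p||m) + 0.5*KL(q||m) = 0.5*0.1438 + 0.5*0.2877 ~ 0.2158 nats.
   */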
  /**
   * Compute the JS divergence between an instance and a cluster; used for
   * the training data.
   *
   * @param instIdx index of the instance
   * @param input statistics of the input data
   * @param T the whole partition
   * @param t index of the cluster
   * @param pi1 weight of the instance distribution
   * @param pi2 weight of the cluster distribution
   * @return the JS divergence
   */
  private double JS(int instIdx, Input input, Partition T, int t, double pi1, double pi2) {
    if (Math.min(pi1, pi2) <= 0) {
      System.out.format("Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n",
                        pi1, pi2);
      return 0;
    }
    Instance inst = m_data.instance(instIdx);
    double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
    for (int i = 0; i < inst.numValues(); i++) {
      tmp = input.Py_x.get(inst.index(i), instIdx);
      if (tmp != 0) {
        kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * T.Py_t.get(inst.index(i), t)));
      }
    }
    for (int i = 0; i < m_numAttributes; i++) {
      if ((tmp = T.Py_t.get(i, t)) != 0) {
        kl2 += tmp * Math.log(tmp / (input.Py_x.get(i, instIdx) * pi1 + pi2 * tmp));
      }
    }
    return pi1 * kl1 + pi2 * kl2;
  }

  /**
   * Compute the JS divergence between an instance and a cluster; used for
   * test data.
   *
   * @param inst instance to be clustered
   * @param t index of the cluster
   * @param pi1 weight of the instance distribution
   * @param pi2 weight of the cluster distribution
   * @return the JS divergence
   */
  private double JS(Instance inst, int t, double pi1, double pi2) {
    if (Math.min(pi1, pi2) <= 0) {
      System.out.format("Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n",
                        pi1, pi2);
      return 0;
    }
    double sum = Utils.sum(inst.toDoubleArray());
    double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
    for (int i = 0; i < inst.numValues(); i++) {
      tmp = inst.valueSparse(i) / sum;
      if (tmp != 0) {
        kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * bestT.Py_t.get(inst.index(i), t)));
      }
    }
    for (int i = 0; i < m_numAttributes; i++) {
      if ((tmp = bestT.Py_t.get(i, t)) != 0) {
        kl2 += tmp * Math.log(tmp / (inst.value(i) * pi1 / sum + pi2 * tmp));
      }
    }
    return pi1 * kl1 + pi2 * kl2;
  }

  /**
   * Compute the sIB score, which is the mutual information
   * I(T;Y) = H(Y) + H(T) - H(Y,T) between the clusters and the attributes.
   *
   * @param m a term-cluster matrix, where m[i][j] is the probability of term i given cluster j
   * @param Pt an array of cluster prior probabilities
   * @return the sIB score, which indicates the quality of the partition
   */
  private double sIB_local_MI(Matrix m, double[] Pt) {
    double Hy = 0.0, Ht = 0.0;
    for (int i = 0; i < Pt.length; i++) {
      Ht += Pt[i] * Math.log(Pt[i]);
    }
    Ht = -Ht;

    for (int i = 0; i < m_numAttributes; i++) {
      double Py = 0.0;
      for (int j = 0; j < m_numCluster; j++) {
        Py += m.get(i, j) * Pt[j];
      }
      if (Py == 0)
        continue;
      Hy += Py * Math.log(Py);
    }
    Hy = -Hy;

    double Hyt = 0.0, tmp = 0.0;
    for (int i = 0; i < m.getRowDimension(); i++) {
      for (int j = 0; j < m.getColumnDimension(); j++) {
        if ((tmp = m.get(i, j)) == 0 || Pt[j] == 0) {
          continue;
        }
        tmp *= Pt[j];
        Hyt += tmp * Math.log(tmp); // accumulates -H(Y,T)
      }
    }
    return Hy + Ht + Hyt;
  }

  /**
   * Get the sum of values of the dataset
   *
   * @param data set of instances to handle
   * @return sum of all the attribute values for all the instances in the dataset
   */
  private double getTotalSum(Instances data) {
    double sum = 0.0;
    for (int i = 0; i < data.numInstances(); i++) {
      for (int v = 0; v < data.instance(i).numValues(); v++) {
        sum += data.instance(i).valueSparse(v);
      }
    }
    return sum;
  }

  /**
   * Transpose the document-term matrix to a term-document matrix
   *
   * @param data instances with document-term info
   * @return a term-document matrix transposed from the input dataset
   */
  private Matrix getTransposedMatrix(Instances data) {
    double[][] temp = new double[data.numAttributes()][data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
      Instance inst = data.instance(i);
      for (int v = 0; v < inst.numValues(); v++) {
        temp[inst.index(v)][i] = inst.valueSparse(v);
      }
    }
    Matrix My_x = new Matrix(temp);
    return My_x;
  }

  /**
   * Normalize the document vectors
   *
   * @param data instances to be normalized
   */
  private void normalizePrior(Instances data) {
    for (int i = 0; i < data.numInstances(); i++) {
      // store the normalized copy back, otherwise the result is discarded
      data.set(i, normalizeInstance(data.instance(i)));
    }
  }
  /**
   * Normalize the instance
   *
   * @param inst instance to be normalized
   * @return a new Instance with normalized values
   */
  private Instance normalizeInstance(Instance inst) {
    double[] vals = inst.toDoubleArray();
    double sum = Utils.sum(vals);
    for (int i = 0; i < vals.length; i++) {
      vals[i] /= sum;
    }
    return new DenseInstance(inst.weight(), vals);
  }

  /**
   * Create a transposed, per-document normalized term-document matrix.
   *
   * @param data instances with document-term info
   * @return the normalized term-document matrix
   */
  private Matrix getTransposedNormedMatrix(Instances data) {
    Matrix matrix = new Matrix(data.numAttributes(), data.numInstances());
    for (int i = 0; i < data.numInstances(); i++) {
      double[] vals = data.instance(i).toDoubleArray();
      double sum = Utils.sum(vals);
      for (int v = 0; v < vals.length; v++) {
        vals[v] /= sum;
        matrix.set(v, i, vals[v]);
      }
    }
    return matrix;
  }

  /**
   * Compute the MI between instances and attributes
   *
   * @param m the term-document matrix
   * @param input object that describes the statistics about the training data
   */
  private void MI(Matrix m, Input input) {
    int minDimSize = m.getColumnDimension() < m.getRowDimension() ?
      m.getColumnDimension() : m.getRowDimension();
    if (minDimSize < 2) {
      System.err.println("Warning : This is not a JOINT distribution");
      input.Hx = Entropy(m);
      input.Hy = 0;
      input.Ixy = 0;
      return;
    }

    input.Hx = Entropy(input.Px);
    input.Hy = Entropy(input.Py);

    double entropy = input.Hx + input.Hy;
    for (int i = 0; i < m_numInstances; i++) {
      Instance inst = m_data.instance(i);
      for (int v = 0; v < inst.numValues(); v++) {
        double tmp = m.get(inst.index(v), i);
        if (tmp <= 0)
          continue;
        entropy += tmp * Math.log(tmp);
      }
    }
    input.Ixy = entropy;
    if (m_verbose) {
      System.out.println("Ixy = " + input.Ixy);
    }
  }

  /**
   * Compute the entropy score based on an array of probabilities
   *
   * @param probs array of strictly positive, normalized probabilities
   * @return the entropy value
   */
  private double Entropy(double[] probs) {
    for (int i = 0; i < probs.length; i++) {
      if (probs[i] <= 0) {
        if (m_verbose) {
          System.out.println("Warning: Zero or negative probability.");
        }
        return Double.NaN;
      }
    }

    // could be unnormalized, when normalization is not specified
    if (Math.abs(Utils.sum(probs) - 1) >= 1e-6) {
      if (m_verbose) {
        System.out.println("Warning: Not normalized.");
      }
      return Double.NaN;
    }

    double mi = 0.0;
    for (int i = 0; i < probs.length; i++) {
      mi += probs[i] * Math.log(probs[i]);
    }
    mi = -mi;
    return mi;
  }

  /**
   * Compute the entropy score based on a matrix
   *
   * @param p a matrix with non-negative and normalized probabilities
   * @return the entropy value
   */
  private double Entropy(Matrix p) {
    double mi = 0;
    for (int i = 0; i < p.getRowDimension(); i++) {
      for (int j = 0; j < p.getColumnDimension(); j++) {
        if (p.get(i, j) == 0) {
          continue;
        }
        mi += p.get(i, j) * Math.log(p.get(i, j));
      }
    }
    mi = -mi;
    return mi;
  }

  /**
   * Parses a given list of options. <p/>
   *
   * Valid options are: <p/>
   *
   * <pre> -I &lt;num&gt;
   *  maximum number of iterations
   *  (default 100).</pre>
   *
   * <pre> -M &lt;num&gt;
   *  minimum number of changes in a single iteration
   *  (default 0).</pre>
   *
   * <pre> -N &lt;num&gt;
   *  number of clusters.
   *  (default 2).</pre>
   *
   * <pre> -R &lt;num&gt;
   *  number of restarts.
   *  (default 5).</pre>
   *
   * <pre> -U
   *  set not to normalize the data
   *  (default true).</pre>
   *
   * <pre> -V
   *  set to output debug info
   *  (default false).</pre>
   *
   * <pre> -S &lt;num&gt;
   *  Random number seed.
   *  (default 1)</pre>
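   *
   * Example command line (a sketch; assumes the standard Weka command-line
   * runner for clusterers, with weka.jar on the classpath and data.arff as a
   * placeholder training file):
   * <pre>
   * java weka.clusterers.sIB -t data.arff -N 3 -R 10
   * </pre>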
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String optionString = Utils.getOption('I', options);
    if (optionString.length() != 0) {
      setMaxIterations(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('M', options);
    if (optionString.length() != 0) {
      setMinChange(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('N', options);
    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('R', options);
    if (optionString.length() != 0) {
      setNumRestarts(Integer.parseInt(optionString));
    }
    setNotUnifyNorm(Utils.getFlag('U', options));
    setDebug(Utils.getFlag('V', options));
    super.setOptions(options);
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector