/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* HierarchicalClusterer.java
* Copyright (C) 2009 University of Waikato, Hamilton, New Zealand
*/
/**
* Hierarchical clustering class.
* Implements a number of classic hierarchical clustering methods.
*
* Valid options are:
*
* -N
* number of clusters
*
*
*
* -L
* Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor Joining)
* [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMLPETE|NEIGHBOR_JOINING]
*
*
* -A
* Distance function to use. (default: weka.core.EuclideanDistance)
*
*
* -P
* Print hierarchy in Newick format, which can be used for display in other programs.
*
*
* -D
* If set, classifier is run in debug mode and may output additional info to the console.
*
*
* -B
* \If set, distance is interpreted as branch length, otherwise it is node height.
*
*
*
*
*
* @author Remco Bouckaert (rrb@xm.co.nz, remco@cs.waikato.ac.nz)
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @version $Revision: 6042 $
*/
package weka.clusterers;
import java.io.Serializable;
import java.text.DecimalFormat;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.PriorityQueue;
import java.util.Vector;
import weka.core.Capabilities;
import weka.core.CapabilitiesHandler;
import weka.core.DistanceFunction;
import weka.core.Drawable;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
public class HierarchicalClusterer extends AbstractClusterer implements OptionHandler, CapabilitiesHandler, Drawable {
private static final long serialVersionUID = 1L;
/** Whether the classifier is run in debug mode. */
protected boolean m_bDebug = false;
/** Whether the distance represent node height (if false) or branch length (if true). */
protected boolean m_bDistanceIsBranchLength = false;
/** training data **/
Instances m_instances;
/** number of clusters desired in clustering **/
int m_nNumClusters = 2;
public void setNumClusters(int nClusters) {m_nNumClusters = Math.max(1,nClusters);}
public int getNumClusters() {return m_nNumClusters;}
/** distance function used for comparing members of a cluster **/
protected DistanceFunction m_DistanceFunction = new EuclideanDistance();
public DistanceFunction getDistanceFunction() {return m_DistanceFunction;}
public void setDistanceFunction(DistanceFunction distanceFunction) {m_DistanceFunction = distanceFunction;}
/** used for priority queue for efficient retrieval of pair of clusters to merge**/
class Tuple {
public Tuple(double d, int i, int j, int nSize1, int nSize2) {
m_fDist = d;
m_iCluster1 = i;
m_iCluster2 = j;
m_nClusterSize1 = nSize1;
m_nClusterSize2 = nSize2;
}
double m_fDist;
int m_iCluster1;
int m_iCluster2;
int m_nClusterSize1;
int m_nClusterSize2;
}
/** comparator used by priority queue**/
class TupleComparator implements Comparator {
public int compare(Tuple o1, Tuple o2) {
if (o1.m_fDist < o2.m_fDist) {
return -1;
} else if (o1.m_fDist == o2.m_fDist) {
return 0;
}
return 1;
}
}
/** the various link types */
final static int SINGLE = 0;
final static int COMPLETE = 1;
final static int AVERAGE = 2;
final static int MEAN = 3;
final static int CENTROID = 4;
final static int WARD = 5;
final static int ADJCOMLPETE = 6;
final static int NEIGHBOR_JOINING = 7;
public static final Tag[] TAGS_LINK_TYPE = {
new Tag(SINGLE, "SINGLE"),
new Tag(COMPLETE, "COMPLETE"),
new Tag(AVERAGE, "AVERAGE"),
new Tag(MEAN, "MEAN"),
new Tag(CENTROID, "CENTROID"),
new Tag(WARD, "WARD"),
new Tag(ADJCOMLPETE,"ADJCOMLPETE"),
new Tag(NEIGHBOR_JOINING,"NEIGHBOR_JOINING")
};
/**
* Holds the Link type used calculate distance between clusters
*/
int m_nLinkType = SINGLE;
boolean m_bPrintNewick = true;;
public boolean getPrintNewick() {return m_bPrintNewick;}
public void setPrintNewick(boolean bPrintNewick) {m_bPrintNewick = bPrintNewick;}
public void setLinkType(SelectedTag newLinkType) {
if (newLinkType.getTags() == TAGS_LINK_TYPE) {
m_nLinkType = newLinkType.getSelectedTag().getID();
}
}
public SelectedTag getLinkType() {
return new SelectedTag(m_nLinkType, TAGS_LINK_TYPE);
}
/** class representing node in cluster hierarchy **/
class Node implements Serializable {
Node m_left;
Node m_right;
Node m_parent;
int m_iLeftInstance;
int m_iRightInstance;
double m_fLeftLength = 0;
double m_fRightLength = 0;
double m_fHeight = 0;
public String toString(int attIndex) {
DecimalFormat myFormatter = new DecimalFormat("#.#####");
if (m_left == null) {
if (m_right == null) {
return "(" + m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
m_instances.instance(m_iRightInstance).stringValue(attIndex) +":" + myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
m_right.toString(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
}
} else {
if (m_right == null) {
return "(" + m_left.toString(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
m_instances.instance(m_iRightInstance).stringValue(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_left.toString(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +m_right.toString(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
}
}
}
public String toString2(int attIndex) {
DecimalFormat myFormatter = new DecimalFormat("#.#####");
if (m_left == null) {
if (m_right == null) {
return "(" + m_instances.instance(m_iLeftInstance).value(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
m_instances.instance(m_iRightInstance).value(attIndex) +":" + myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_instances.instance(m_iLeftInstance).value(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
m_right.toString2(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
}
} else {
if (m_right == null) {
return "(" + m_left.toString2(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +
m_instances.instance(m_iRightInstance).value(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_left.toString2(attIndex) + ":" + myFormatter.format(m_fLeftLength) + "," +m_right.toString2(attIndex) + ":" + myFormatter.format(m_fRightLength) + ")";
}
}
}
void setHeight(double fHeight1, double fHeight2) {
m_fHeight = fHeight1;
if (m_left == null) {
m_fLeftLength = fHeight1;
} else {
m_fLeftLength = fHeight1 - m_left.m_fHeight;
}
if (m_right == null) {
m_fRightLength = fHeight2;
} else {
m_fRightLength = fHeight2 - m_right.m_fHeight;
}
}
void setLength(double fLength1, double fLength2) {
m_fLeftLength = fLength1;
m_fRightLength = fLength2;
m_fHeight = fLength1;
if (m_left != null) {
m_fHeight += m_left.m_fHeight;
}
}
}
Node [] m_clusters;
int [] m_nClusterNr;
@Override
public void buildClusterer(Instances data) throws Exception {
// /System.err.println("Method " + m_nLinkType);
m_instances = data;
int nInstances = m_instances.numInstances();
if (nInstances == 0) {
return;
}
m_DistanceFunction.setInstances(m_instances);
// use array of integer vectors to store cluster indices,
// starting with one cluster per instance
Vector [] nClusterID = new Vector[data.numInstances()];
for (int i = 0; i < data.numInstances(); i++) {
nClusterID[i] = new Vector();
nClusterID[i].add(i);
}
// calculate distance matrix
int nClusters = data.numInstances();
// used for keeping track of hierarchy
Node [] clusterNodes = new Node[nInstances];
if (m_nLinkType == NEIGHBOR_JOINING) {
neighborJoining(nClusters, nClusterID, clusterNodes);
} else {
doLinkClustering(nClusters, nClusterID, clusterNodes);
}
// move all clusters in m_nClusterID array
// & collect hierarchy
int iCurrent = 0;
m_clusters = new Node[m_nNumClusters];
m_nClusterNr = new int[nInstances];
for (int i = 0; i < nInstances; i++) {
if (nClusterID[i].size() > 0) {
for (int j = 0; j < nClusterID[i].size(); j++) {
m_nClusterNr[nClusterID[i].elementAt(j)] = iCurrent;
}
m_clusters[iCurrent] = clusterNodes[i];
iCurrent++;
}
}
} // buildClusterer
/** use neighbor joining algorithm for clustering
* This is roughly based on the RapidNJ simple implementation and runs at O(n^3)
* More efficient implementations exist, see RapidNJ (or my GPU implementation :-))
* @param nClusters
* @param nClusterID
* @param clusterNodes
*/
void neighborJoining(int nClusters, Vector[] nClusterID, Node [] clusterNodes) {
int n = m_instances.numInstances();
double [][] fDist = new double[nClusters][nClusters];
for (int i = 0; i < nClusters; i++) {
fDist[i][i] = 0;
for (int j = i+1; j < nClusters; j++) {
fDist[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
fDist[j][i] = fDist[i][j];
}
}
double [] fSeparationSums = new double [n];
double [] fSeparations = new double [n];
int [] nNextActive = new int[n];
//calculate initial separation rows
for(int i = 0; i < n; i++){
double fSum = 0;
for(int j = 0; j < n; j++){
fSum += fDist[i][j];
}
fSeparationSums[i] = fSum;
fSeparations[i] = fSum / (nClusters - 2);
nNextActive[i] = i +1;
}
while (nClusters > 2) {
// find minimum
int iMin1 = -1;
int iMin2 = -1;
double fMin = Double.MAX_VALUE;
if (m_bDebug) {
for (int i = 0; i < n; i++) {
if(nClusterID[i].size() > 0){
double [] fRow = fDist[i];
double fSep1 = fSeparations[i];
for(int j = 0; j < n; j++){
if(nClusterID[j].size() > 0 && i != j){
double fSep2 = fSeparations[j];
double fVal = fRow[j] - fSep1 - fSep2;
if(fVal < fMin){
// new minimum
iMin1 = i;
iMin2 = j;
fMin = fVal;
}
}
}
}
}
} else {
int i = 0;
while (i < n) {
double fSep1 = fSeparations[i];
double [] fRow = fDist[i];
int j = nNextActive[i];
while (j < n) {
double fSep2 = fSeparations[j];
double fVal = fRow[j] - fSep1 - fSep2;
if(fVal < fMin){
// new minimum
iMin1 = i;
iMin2 = j;
fMin = fVal;
}
j = nNextActive[j];
}
i = nNextActive[i];
}
}
// record distance
double fMinDistance = fDist[iMin1][iMin2];
nClusters--;
double fSep1 = fSeparations[iMin1];
double fSep2 = fSeparations[iMin2];
double fDist1 = (0.5 * fMinDistance) + (0.5 * (fSep1 - fSep2));
double fDist2 = (0.5 * fMinDistance) + (0.5 * (fSep2 - fSep1));
if (nClusters > 2) {
// update separations & distance
double fNewSeparationSum = 0;
double fMutualDistance = fDist[iMin1][iMin2];
double[] fRow1 = fDist[iMin1];
double[] fRow2 = fDist[iMin2];
for(int i = 0; i < n; i++) {
if(i == iMin1 || i == iMin2 || nClusterID[i].size() == 0) {
fRow1[i] = 0;
} else {
double fVal1 = fRow1[i];
double fVal2 = fRow2[i];
double fDistance = (fVal1 + fVal2 - fMutualDistance) / 2.0;
fNewSeparationSum += fDistance;
// update the separationsum of cluster i.
fSeparationSums[i] += (fDistance - fVal1 - fVal2);
fSeparations[i] = fSeparationSums[i] / (nClusters -2);
fRow1[i] = fDistance;
fDist[i][iMin1] = fDistance;
}
}
fSeparationSums[iMin1] = fNewSeparationSum;
fSeparations[iMin1] = fNewSeparationSum / (nClusters - 2);
fSeparationSums[iMin2] = 0;
merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
int iPrev = iMin2;
// since iMin1 < iMin2 we havenActiveRows[0] >= 0, so the next loop should be save
while (nClusterID[iPrev].size() == 0) {
iPrev--;
}
nNextActive[iPrev] = nNextActive[iMin2];
} else {
merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
break;
}
}
for (int i = 0; i < n; i++) {
if (nClusterID[i].size() > 0) {
for (int j = i+1; j < n; j++) {
if (nClusterID[j].size() > 0) {
double fDist1 = fDist[i][j];
if(nClusterID[i].size() == 1) {
merge(i,j,fDist1,0,nClusterID, clusterNodes);
} else if (nClusterID[j].size() == 1) {
merge(i,j,0,fDist1,nClusterID, clusterNodes);
} else {
merge(i,j,fDist1/2.0,fDist1/2.0,nClusterID, clusterNodes);
}
break;
}
}
}
}
} // neighborJoining
/** Perform clustering using a link method
* This implementation uses a priority queue resulting in a O(n^2 log(n)) algorithm
* @param nClusters number of clusters
* @param nClusterID
* @param clusterNodes
*/
void doLinkClustering(int nClusters, Vector[] nClusterID, Node [] clusterNodes) {
int nInstances = m_instances.numInstances();
PriorityQueue queue = new PriorityQueue(nClusters*nClusters/2, new TupleComparator());
double [][] fDistance0 = new double[nClusters][nClusters];
double [][] fClusterDistance = null;
if (m_bDebug) {
fClusterDistance = new double[nClusters][nClusters];
}
for (int i = 0; i < nClusters; i++) {
fDistance0[i][i] = 0;
for (int j = i+1; j < nClusters; j++) {
fDistance0[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
fDistance0[j][i] = fDistance0[i][j];
queue.add(new Tuple(fDistance0[i][j], i, j, 1, 1));
if (m_bDebug) {
fClusterDistance[i][j] = fDistance0[i][j];
fClusterDistance[j][i] = fDistance0[i][j];
}
}
}
while (nClusters > m_nNumClusters) {
int iMin1 = -1;
int iMin2 = -1;
// find closest two clusters
if (m_bDebug) {
/* simple but inefficient implementation */
double fMinDistance = Double.MAX_VALUE;
for (int i = 0; i < nInstances; i++) {
if (nClusterID[i].size()>0) {
for (int j = i+1; j < nInstances; j++) {
if (nClusterID[j].size()>0) {
double fDist = fClusterDistance[i][j];
if (fDist < fMinDistance) {
fMinDistance = fDist;
iMin1 = i;
iMin2 = j;
}
}
}
}
}
merge(iMin1, iMin2, fMinDistance, fMinDistance, nClusterID, clusterNodes);
} else {
// use priority queue to find next best pair to cluster
Tuple t;
do {
t = queue.poll();
} while (t!=null && (nClusterID[t.m_iCluster1].size() != t.m_nClusterSize1 || nClusterID[t.m_iCluster2].size() != t.m_nClusterSize2));
iMin1 = t.m_iCluster1;
iMin2 = t.m_iCluster2;
merge(iMin1, iMin2, t.m_fDist, t.m_fDist, nClusterID, clusterNodes);
}
// merge clusters
// update distances & queue
for (int i = 0; i < nInstances; i++) {
if (i != iMin1 && nClusterID[i].size()!=0) {
int i1 = Math.min(iMin1,i);
int i2 = Math.max(iMin1,i);
double fDistance = getDistance(fDistance0, nClusterID[i1], nClusterID[i2]);
if (m_bDebug) {
fClusterDistance[i1][i2] = fDistance;
fClusterDistance[i2][i1] = fDistance;
}
queue.add(new Tuple(fDistance, i1, i2, nClusterID[i1].size(), nClusterID[i2].size()));
}
}
nClusters--;
}
} // doLinkClustering
void merge(int iMin1, int iMin2, double fDist1, double fDist2, Vector[] nClusterID, Node [] clusterNodes) {
if (m_bDebug) {
System.err.println("Merging " + iMin1 + " " + iMin2 + " " + fDist1 + " " + fDist2);
}
if (iMin1 > iMin2) {
int h = iMin1; iMin1 = iMin2; iMin2 = h;
double f = fDist1; fDist1 = fDist2; fDist2 = f;
}
nClusterID[iMin1].addAll(nClusterID[iMin2]);
nClusterID[iMin2].removeAllElements();
// track hierarchy
Node node = new Node();
if (clusterNodes[iMin1] == null) {
node.m_iLeftInstance = iMin1;
} else {
node.m_left = clusterNodes[iMin1];
clusterNodes[iMin1].m_parent = node;
}
if (clusterNodes[iMin2] == null) {
node.m_iRightInstance = iMin2;
} else {
node.m_right = clusterNodes[iMin2];
clusterNodes[iMin2].m_parent = node;
}
if (m_bDistanceIsBranchLength) {
node.setLength(fDist1, fDist2);
} else {
node.setHeight(fDist1, fDist2);
}
clusterNodes[iMin1] = node;
} // merge
/** calculate distance the first time when setting up the distance matrix **/
double getDistance0(Vector cluster1, Vector cluster2) {
double fBestDist = Double.MAX_VALUE;
switch (m_nLinkType) {
case SINGLE:
case NEIGHBOR_JOINING:
case CENTROID:
case COMPLETE:
case ADJCOMLPETE:
case AVERAGE:
case MEAN:
// set up two instances for distance function
Instance instance1 = (Instance) m_instances.instance(cluster1.elementAt(0)).copy();
Instance instance2 = (Instance) m_instances.instance(cluster2.elementAt(0)).copy();
fBestDist = m_DistanceFunction.distance(instance1, instance2);
break;
case WARD:
{
// finds the distance of the change in caused by merging the cluster.
// The information of a cluster is calculated as the error sum of squares of the
// centroids of the cluster and its members.
double ESS1 = calcESS(cluster1);
double ESS2 = calcESS(cluster2);
Vector merged = new Vector();
merged.addAll(cluster1);
merged.addAll(cluster2);
double ESS = calcESS(merged);
fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
}
break;
}
return fBestDist;
} // getDistance0
/** calculate the distance between two clusters
* @param cluster1 list of indices of instances in the first cluster
* @param cluster2 dito for second cluster
* @return distance between clusters based on link type
*/
double getDistance(double [][] fDistance, Vector cluster1, Vector cluster2) {
double fBestDist = Double.MAX_VALUE;
switch (m_nLinkType) {
case SINGLE:
// find single link distance aka minimum link, which is the closest distance between
// any item in cluster1 and any item in cluster2
fBestDist = Double.MAX_VALUE;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fBestDist > fDist) {
fBestDist = fDist;
}
}
}
break;
case COMPLETE:
case ADJCOMLPETE:
// find complete link distance aka maximum link, which is the largest distance between
// any item in cluster1 and any item in cluster2
fBestDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fBestDist < fDist) {
fBestDist = fDist;
}
}
}
if (m_nLinkType == COMPLETE) {
break;
}
// calculate adjustment, which is the largest within cluster distance
double fMaxDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = i+1; j < cluster1.size(); j++) {
int i2 = cluster1.elementAt(j);
double fDist = fDistance[i1][i2];
if (fMaxDist < fDist) {
fMaxDist = fDist;
}
}
}
for (int i = 0; i < cluster2.size(); i++) {
int i1 = cluster2.elementAt(i);
for (int j = i+1; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fMaxDist < fDist) {
fMaxDist = fDist;
}
}
}
fBestDist -= fMaxDist;
break;
case AVERAGE:
// finds average distance between the elements of the two clusters
fBestDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
fBestDist += fDistance[i1][i2];
}
}
fBestDist /= (cluster1.size() * cluster2.size());
break;
case MEAN:
{
// calculates the mean distance of a merged cluster (akak Group-average agglomerative clustering)
Vector merged = new Vector();
merged.addAll(cluster1);
merged.addAll(cluster2);
fBestDist = 0;
for (int i = 0; i < merged.size(); i++) {
int i1 = merged.elementAt(i);
for (int j = i+1; j < merged.size(); j++) {
int i2 = merged.elementAt(j);
fBestDist += fDistance[i1][i2];
}
}
int n = merged.size();
fBestDist /= (n*(n-1.0)/2.0);
}
break;
case CENTROID:
// finds the distance of the centroids of the clusters
double [] fValues1 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster1.size(); i++) {
Instance instance = m_instances.instance(cluster1.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] += instance.value(j);
}
}
double [] fValues2 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster2.size(); i++) {
Instance instance = m_instances.instance(cluster2.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues2[j] += instance.value(j);
}
}
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] /= cluster1.size();
fValues2[j] /= cluster2.size();
}
// set up two instances for distance function
Instance instance1 = (Instance) m_instances.instance(0).copy();
Instance instance2 = (Instance) m_instances.instance(0).copy();
for (int j = 0; j < m_instances.numAttributes(); j++) {
instance1.setValue(j, fValues1[j]);
instance2.setValue(j, fValues2[j]);
}
fBestDist = m_DistanceFunction.distance(instance1, instance2);
break;
case WARD:
{
// finds the distance of the change in caused by merging the cluster.
// The information of a cluster is calculated as the error sum of squares of the
// centroids of the cluster and its members.
double ESS1 = calcESS(cluster1);
double ESS2 = calcESS(cluster2);
Vector merged = new Vector();
merged.addAll(cluster1);
merged.addAll(cluster2);
double ESS = calcESS(merged);
fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
}
break;
}
return fBestDist;
} // getDistance
/** calculated error sum-of-squares for instances wrt centroid **/
double calcESS(Vector cluster) {
double [] fValues1 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster.size(); i++) {
Instance instance = m_instances.instance(cluster.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] += instance.value(j);
}
}
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] /= cluster.size();
}
// set up two instances for distance function
Instance centroid = (Instance) m_instances.instance(cluster.elementAt(0)).copy();
for (int j = 0; j < m_instances.numAttributes(); j++) {
centroid.setValue(j, fValues1[j]);
}
double fESS = 0;
for (int i = 0; i < cluster.size(); i++) {
Instance instance = m_instances.instance(cluster.elementAt(i));
fESS += m_DistanceFunction.distance(centroid, instance);
}
return fESS / cluster.size();
} // calcESS
@Override
/** instances are assigned a cluster by finding the instance in the training data
* with the closest distance to the instance to be clustered. The cluster index of
* the training data point is taken as the cluster index.
*/
public int clusterInstance(Instance instance) throws Exception {
if (m_instances.numInstances() == 0) {
return 0;
}
double fBestDist = Double.MAX_VALUE;
int iBestInstance = -1;
for (int i = 0; i < m_instances.numInstances(); i++) {
double fDist = m_DistanceFunction.distance(instance, m_instances.instance(i));
if (fDist < fBestDist) {
fBestDist = fDist;
iBestInstance = i;
}
}
return m_nClusterNr[iBestInstance];
}
@Override
/** create distribution with all clusters having zero probability, except the
* cluster the instance is assigned to.
*/
public double[] distributionForInstance(Instance instance) throws Exception {
if (numberOfClusters() == 0) {
double [] p = new double[1];
p[0] = 1;
return p;
}
double [] p = new double[numberOfClusters()];
p[clusterInstance(instance)] = 1.0;
return p;
}
@Override
public Capabilities getCapabilities() {
Capabilities result = new Capabilities(this);
result.disableAll();
result.enable(Capability.NO_CLASS);
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.DATE_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
result.enable(Capability.STRING_ATTRIBUTES);
// other
result.setMinimumNumberInstances(0);
return result;
}
@Override
public int numberOfClusters() throws Exception {
return Math.min(m_nNumClusters, m_instances.numInstances());
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
public Enumeration listOptions() {
Vector newVector = new Vector(8);
newVector.addElement(new Option(
"\tIf set, classifier is run in debug mode and\n"
+ "\tmay output additional info to the console",
"D", 0, "-D"));
newVector.addElement(new Option(
"\tIf set, distance is interpreted as branch length\n"
+ "\totherwise it is node height.",
"B", 0, "-B"));
newVector.addElement(new Option(
"\tnumber of clusters",
"N", 1,"-N "));
newVector.addElement(new Option(
"\tFlag to indicate the cluster should be printed in Newick format.",
"P", 0,"-P"));
newVector.addElement(
new Option(
"Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor joining)", "L", 1,
"-L [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMLPETE|NEIGHBOR_JOINING]"));
newVector.add(new Option(
"\tDistance function to use.\n"
+ "\t(default: weka.core.EuclideanDistance)",
"A", 1,"-A "));
return newVector.elements();
}
/**
* Parses a given list of options.
*
* Valid options are:
*
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*/
public void setOptions(String[] options) throws Exception {
m_bPrintNewick = Utils.getFlag('P', options);
String optionString = Utils.getOption('N', options);
if (optionString.length() != 0) {
Integer temp = new Integer(optionString);
setNumClusters(temp);
}
else {
setNumClusters(2);
}
setDebug(Utils.getFlag('D', options));
setDistanceIsBranchLength(Utils.getFlag('B', options));
String sLinkType = Utils.getOption('L', options);
if (sLinkType.compareTo("SINGLE") == 0) {setLinkType(new SelectedTag(SINGLE, TAGS_LINK_TYPE));}
if (sLinkType.compareTo("COMPLETE") == 0) {setLinkType(new SelectedTag(COMPLETE, TAGS_LINK_TYPE));}
if (sLinkType.compareTo("AVERAGE") == 0) {setLinkType(new SelectedTag(AVERAGE, TAGS_LINK_TYPE));}
if (sLinkType.compareTo("MEAN") == 0) {setLinkType(new SelectedTag(MEAN, TAGS_LINK_TYPE));}
if (sLinkType.compareTo("CENTROID") == 0) {setLinkType(new SelectedTag(CENTROID, TAGS_LINK_TYPE));}
if (sLinkType.compareTo("WARD") == 0) {setLinkType(new SelectedTag(WARD, TAGS_LINK_TYPE));}
if (sLinkType.compareTo("ADJCOMLPETE") == 0) {setLinkType(new SelectedTag(ADJCOMLPETE, TAGS_LINK_TYPE));}
if (sLinkType.compareTo("NEIGHBOR_JOINING") == 0) {setLinkType(new SelectedTag(NEIGHBOR_JOINING, TAGS_LINK_TYPE));}
String nnSearchClass = Utils.getOption('A', options);
if(nnSearchClass.length() != 0) {
String nnSearchClassSpec[] = Utils.splitOptions(nnSearchClass);
if(nnSearchClassSpec.length == 0) {
throw new Exception("Invalid DistanceFunction specification string.");
}
String className = nnSearchClassSpec[0];
nnSearchClassSpec[0] = "";
setDistanceFunction( (DistanceFunction)
Utils.forName( DistanceFunction.class,
className, nnSearchClassSpec) );
}
else {
setDistanceFunction(new EuclideanDistance());
}
Utils.checkForRemainingOptions(options);
}
/**
* Gets the current settings of the clusterer.
*
* @return an array of strings suitable for passing to setOptions()
*/
public String [] getOptions() {
String [] options = new String [14];
int current = 0;
options[current++] = "-N";
options[current++] = "" + getNumClusters();
options[current++] = "-L";
switch (m_nLinkType) {
case (SINGLE) :options[current++] = "SINGLE";break;
case (COMPLETE) :options[current++] = "COMPLETE";break;
case (AVERAGE) :options[current++] = "AVERAGE";break;
case (MEAN) :options[current++] = "MEAN";break;
case (CENTROID) :options[current++] = "CENTROID";break;
case (WARD) :options[current++] = "WARD";break;
case (ADJCOMLPETE) :options[current++] = "ADJCOMLPETE";break;
case (NEIGHBOR_JOINING) :options[current++] = "NEIGHBOR_JOINING";break;
}
if (m_bPrintNewick) {
options[current++] = "-P";
}
if (getDebug()) {
options[current++] = "-D";
}
if (getDistanceIsBranchLength()) {
options[current++] = "-B";
}
options[current++] = "-A";
options[current++] = (m_DistanceFunction.getClass().getName() + " " +
Utils.joinOptions(m_DistanceFunction.getOptions())).trim();
while (current < options.length) {
options[current++] = "";
}
return options;
}
public String toString() {
StringBuffer buf = new StringBuffer();
int attIndex = m_instances.classIndex();
if (attIndex < 0) {
// try find a string, or last attribute otherwise
attIndex = 0;
while (attIndex < m_instances.numAttributes()-1) {
if (m_instances.attribute(attIndex).isString()) {
break;
}
attIndex++;
}
}
try {
if (m_bPrintNewick && (numberOfClusters() > 0)) {
for (int i = 0; i < m_clusters.length; i++) {
if (m_clusters[i] != null) {
buf.append("Cluster " + i + "\n");
if (m_instances.attribute(attIndex).isString()) {
buf.append(m_clusters[i].toString(attIndex));
} else {
buf.append(m_clusters[i].toString2(attIndex));
}
buf.append("\n\n");
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
return buf.toString();
}
/**
* Set debugging mode.
*
* @param debug true if debug output should be printed
*/
public void setDebug(boolean debug) {
m_bDebug = debug;
}
/**
* Get whether debugging is turned on.
*
* @return true if debugging output is on
*/
public boolean getDebug() {
return m_bDebug;
}
public boolean getDistanceIsBranchLength() {return m_bDistanceIsBranchLength;}
public void setDistanceIsBranchLength(boolean bDistanceIsHeight) {m_bDistanceIsBranchLength = bDistanceIsHeight;}
public String distanceIsHeightTipText() {
return "If set to false, the distance between clusters is interpreted " +
"as the height of the node linking the clusters. This is appropriate for " +
"example for single link clustering. However, for neighbor joining, the " +
"distance is better interpreted as branch length. Set this flag to " +
"get the latter interpretation.";
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String debugTipText() {
return "If set to true, classifier may output additional info to " +
"the console.";
}
/**
* @return a string to describe the NumClusters
*/
public String numClustersTipText() {
return "Sets the number of clusters. " +
"If a single hierarchy is desired, set this to 1.";
}
/**
* @return a string to describe the print Newick flag
*/
public String printNewickTipText() {
return "Flag to indicate whether the cluster should be print in Newick format." +
" This can be useful for display in other programs. However, for large datasets" +
" a lot of text may be produced, which may not be a nuisance when the Newick format" +
" is not required";
}
/**
* @return a string to describe the distance function
*/
public String distanceFunctionTipText() {
return "Sets the distance function, which measures the distance between two individual. " +
"instances (or possibly the distance between an instance and the centroid of a cluster" +
"depending on the Link type).";
}
/**
* @return a string to describe the Link type
*/
public String linkTypeTipText() {
return "Sets the method used to measure the distance between two clusters.\n" +
"SINGLE:\n" +
" find single link distance aka minimum link, which is the closest distance between" +
" any item in cluster1 and any item in cluster2\n" +
"COMPLETE:\n" +
" find complete link distance aka maximum link, which is the largest distance between" +
" any item in cluster1 and any item in cluster2\n" +
"ADJCOMLPETE:\n" +
" as COMPLETE, but with adjustment, which is the largest within cluster distance\n" +
"AVERAGE:\n" +
" finds average distance between the elements of the two clusters\n" +
"MEAN: \n" +
" calculates the mean distance of a merged cluster (akak Group-average agglomerative clustering)\n" +
"CENTROID:\n" +
" finds the distance of the centroids of the clusters\n" +
"WARD:\n" +
" finds the distance of the change in caused by merging the cluster." +
" The information of a cluster is calculated as the error sum of squares of the" +
" centroids of the cluster and its members.\n" +
"NEIGHBOR_JOINING\n" +
" use neighbor joining algorithm."
;
}
/**
* This will return a string describing the clusterer.
* @return The string.
*/
public String globalInfo() {
return
"Hierarchical clustering class.\n" +
"Implements a number of classic agglomorative (i.e. bottom up) hierarchical clustering methods" +
"based on .";
}
public static void main(String [] argv) {
runClusterer(new HierarchicalClusterer(), argv);
}
@Override
public String graph() throws Exception {
if (numberOfClusters() == 0) {
return "Newick:(no,clusters)";
}
int attIndex = m_instances.classIndex();
if (attIndex < 0) {
// try find a string, or last attribute otherwise
attIndex = 0;
while (attIndex < m_instances.numAttributes()-1) {
if (m_instances.attribute(attIndex).isString()) {
break;
}
attIndex++;
}
}
String sNewick = null;
if (m_instances.attribute(attIndex).isString()) {
sNewick = m_clusters[0].toString(attIndex);
} else {
sNewick = m_clusters[0].toString2(attIndex);
}
return "Newick:" + sNewick;
}
@Override
public int graphType() {
return Drawable.Newick;
}
/**
* Returns the revision string.
*
* @return the revision
*/
public String getRevision() {
return RevisionUtils.extract("$Revision: 6042 $");
}
} // class HierarchicalClusterer