source: branches/MetisMQI/src/main/java/weka/core/neighboursearch/kdtrees/KMeansInpiredMethod.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 14.2 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * KMeansInpiredMethod.java
19 * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
20 */
21
22package weka.core.neighboursearch.kdtrees;
23
24import weka.core.Instance;
25import weka.core.Instances;
26import weka.core.RevisionUtils;
27import weka.core.TechnicalInformation;
28import weka.core.TechnicalInformationHandler;
29import weka.core.TechnicalInformation.Field;
30import weka.core.TechnicalInformation.Type;
31
32/**
33 <!-- globalinfo-start -->
34 * The class that splits a node into two such that the overall sum of squared distances of points to their centres on both sides of the (axis-parallel) splitting plane is minimum.<br/>
35 * <br/>
36 * For more information see also:<br/>
37 * <br/>
38 * Ashraf Masood Kibriya (2007). Fast Algorithms for Nearest Neighbour Search. Hamilton, New Zealand.
39 * <p/>
40 <!-- globalinfo-end -->
41 *
42 <!-- technical-bibtex-start -->
43 * BibTeX:
44 * <pre>
45 * &#64;mastersthesis{Kibriya2007,
46 *    address = {Hamilton, New Zealand},
47 *    author = {Ashraf Masood Kibriya},
48 *    school = {Department of Computer Science, School of Computing and Mathematical Sciences, University of Waikato},
49 *    title = {Fast Algorithms for Nearest Neighbour Search},
50 *    year = {2007}
51 * }
52 * </pre>
53 * <p/>
54 <!-- technical-bibtex-end -->
55 *
56 <!-- options-start -->
57 <!-- options-end -->
58 *
59 * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz)
60 * @version $Revision: 5953 $
61 */
62public class KMeansInpiredMethod
63  extends KDTreeNodeSplitter
64  implements TechnicalInformationHandler {
65 
66  /** for serialization. */
67  private static final long serialVersionUID = -866783749124714304L;
68
69  /**
70   * Returns a string describing this nearest neighbour search algorithm.
71   *
72   * @return            a description of the algorithm for displaying in the
73   *                    explorer/experimenter gui
74   */
75  public String globalInfo() {
76    return 
77        "The class that splits a node into two such that the overall sum "
78      + "of squared distances of points to their centres on both sides " 
79      + "of the (axis-parallel) splitting plane is minimum.\n\n"
80      + "For more information see also:\n\n"
81      + getTechnicalInformation().toString();
82  }
83
84  /**
85   * Returns an instance of a TechnicalInformation object, containing detailed
86   * information about the technical background of this class, e.g., paper
87   * reference or book this class is based on.
88   *
89   * @return            the technical information about this class
90   */
91  public TechnicalInformation getTechnicalInformation() {
92    TechnicalInformation result;
93
94    result = new TechnicalInformation(Type.MASTERSTHESIS);
95    result.setValue(Field.AUTHOR, "Ashraf Masood Kibriya");
96    result.setValue(Field.TITLE, "Fast Algorithms for Nearest Neighbour Search");
97    result.setValue(Field.YEAR, "2007");
98    result.setValue(Field.SCHOOL, "Department of Computer Science, School of Computing and Mathematical Sciences, University of Waikato");
99    result.setValue(Field.ADDRESS, "Hamilton, New Zealand");
100
101    return result;
102  }
103
104  /**
105   * Splits a node into two such that the overall sum of squared distances
106   * of points to their centres on both sides of the (axis-parallel)
107   * splitting plane is minimum. The two nodes created after the whole
108   * splitting are correctly initialised. And, node.left and node.right
109   * are set appropriately.
110   * @param node The node to split.
111   * @param numNodesCreated The number of nodes that so far have been
112   * created for the tree, so that the newly created nodes are
113   * assigned correct/meaningful node numbers/ids.
114   * @param nodeRanges The attributes' range for the points inside
115   * the node that is to be split.
116   * @param universe The attributes' range for the whole
117   * point-space.
118   * @throws Exception If there is some problem in splitting the
119   * given node.
120   */
121  public void splitNode(KDTreeNode node, int numNodesCreated,
122      double[][] nodeRanges, double[][] universe) throws Exception {
123
124    correctlyInitialized();
125
126    int splitDim = -1;
127    double splitVal = Double.NEGATIVE_INFINITY;
128
129    double leftAttSum[] = new double[m_Instances.numAttributes()], 
130           rightAttSum[] = new double[m_Instances.numAttributes()], 
131           leftAttSqSum[] = new double[m_Instances.numAttributes()], 
132           rightAttSqSum[] = new double[m_Instances.numAttributes()], 
133           rightSqMean, leftSqMean, leftSqSum, rightSqSum, 
134           minSum = Double.POSITIVE_INFINITY, val;
135
136    for (int dim = 0; dim < m_Instances.numAttributes(); dim++) {
137      // m_MaxRelativeWidth in KDTree ensure there'll be atleast one dim with
138      // width > 0.0
139      if (node.m_NodeRanges[dim][WIDTH] == 0.0
140          || dim == m_Instances.classIndex())
141        continue;
142
143      quickSort(m_Instances, m_InstList, dim, node.m_Start, node.m_End);
144
145      for (int i = node.m_Start; i <= node.m_End; i++) {
146        for (int j = 0; j < m_Instances.numAttributes(); j++) {
147          if (j == m_Instances.classIndex())
148            continue;
149          val = m_Instances.instance(m_InstList[i]).value(j);
150          if (m_NormalizeNodeWidth) {
151            if (Double.isNaN(universe[j][MIN])
152                || universe[j][MIN] == universe[j][MAX])
153              val = 0.0;
154            else
155              val = ((val - universe[j][MIN]) / universe[j][WIDTH]); // normalizing
156                                                                      // value
157          }
158          if (i == node.m_Start) {
159            leftAttSum[j] = rightAttSum[j] = leftAttSqSum[j] = rightAttSqSum[j] = 0.0;
160          }
161          rightAttSum[j] += val;
162          rightAttSqSum[j] += val * val;
163        }
164      }
165
166      for (int i = node.m_Start; i <= node.m_End - 1; i++) {
167        Instance inst = m_Instances.instance(m_InstList[i]);
168        leftSqSum = rightSqSum = 0.0;
169        for (int j = 0; j < m_Instances.numAttributes(); j++) {
170          if (j == m_Instances.classIndex())
171            continue;
172          val = inst.value(j);
173
174          if (m_NormalizeNodeWidth) {
175            if (Double.isNaN(universe[j][MIN])
176                || universe[j][MIN] == universe[j][MAX])
177              val = 0.0;
178            else
179              val = ((val - universe[j][MIN]) / universe[j][WIDTH]); // normalizing
180                                                                      // value
181          }
182
183          leftAttSum[j] += val;
184          rightAttSum[j] -= val;
185          leftAttSqSum[j] += val * val;
186          rightAttSqSum[j] -= val * val;
187          leftSqMean = leftAttSum[j] / (i - node.m_Start + 1);
188          leftSqMean *= leftSqMean;
189          rightSqMean = rightAttSum[j] / (node.m_End - i);
190          rightSqMean *= rightSqMean;
191
192          leftSqSum += leftAttSqSum[j] - (i - node.m_Start + 1) * leftSqMean;
193          rightSqSum += rightAttSqSum[j] - (node.m_End - i) * rightSqMean;
194        }
195
196        if (minSum > (leftSqSum + rightSqSum)) {
197          minSum = leftSqSum + rightSqSum;
198
199          if (i < node.m_End)
200            splitVal = (m_Instances.instance(m_InstList[i]).value(dim) + m_Instances
201                .instance(m_InstList[i + 1]).value(dim)) / 2;
202          else
203            splitVal = m_Instances.instance(m_InstList[i]).value(dim);
204
205          splitDim = dim;
206        }
207      }// end for instance i
208    }// end for attribute dim
209
210    int rightStart = rearrangePoints(m_InstList, node.m_Start, node.m_End,
211        splitDim, splitVal);
212
213    if (rightStart == node.m_Start || rightStart > node.m_End) {
214      System.out.println("node.m_Start: " + node.m_Start + " node.m_End: "
215          + node.m_End + " splitDim: " + splitDim + " splitVal: " + splitVal
216          + " node.min: " + node.m_NodeRanges[splitDim][MIN] + " node.max: "
217          + node.m_NodeRanges[splitDim][MAX] + " node.numInstances: "
218          + node.numInstances());
219
220      if (rightStart == node.m_Start)
221        throw new Exception("Left child is empty in node " + node.m_NodeNumber
222            + ". Not possible with "
223            + "KMeanInspiredMethod splitting method. Please " + "check code.");
224      else
225        throw new Exception("Right child is empty in node " + node.m_NodeNumber
226            + ". Not possible with "
227            + "KMeansInspiredMethod splitting method. Please " + "check code.");
228    }
229
230    node.m_SplitDim = splitDim;
231    node.m_SplitValue = splitVal;
232    node.m_Left = new KDTreeNode(numNodesCreated + 1, node.m_Start,
233        rightStart - 1, m_EuclideanDistance.initializeRanges(m_InstList,
234            node.m_Start, rightStart - 1));
235    node.m_Right = new KDTreeNode(numNodesCreated + 2, rightStart, node.m_End,
236        m_EuclideanDistance
237            .initializeRanges(m_InstList, rightStart, node.m_End));
238  }
239
240  /**
241   * Partitions the instances around a pivot. Used by quicksort and
242   * kthSmallestValue.
243   *
244   * @param insts       The instances on which the tree is (or is
245   * to be) built.
246   * @param index The master index array containing indices
247   * of the instances.
248   * @param attidx The attribution/dimension based on which
249   * the instances should be partitioned.
250   * @param l   The begining index of the portion of master index
251   * array that should be partitioned.
252   * @param r   The end index of the portion of master index array
253   * that should be partitioned.
254   * @return the index of the middle element
255   */
256  protected static int partition(Instances insts, int[] index, int attidx, int l, int r) {
257   
258    double pivot = insts.instance(index[(l + r) / 2]).value(attidx);
259    int help;
260
261    while (l < r) {
262      while ((insts.instance(index[l]).value(attidx) < pivot) && (l < r)) {
263        l++;
264      }
265      while ((insts.instance(index[r]).value(attidx) > pivot) && (l < r)) {
266        r--;
267      }
268      if (l < r) {
269        help = index[l];
270        index[l] = index[r];
271        index[r] = help;
272        l++;
273        r--;
274      }
275    }
276    if ((l == r) && (insts.instance(index[r]).value(attidx) > pivot)) {
277      r--;
278    } 
279
280    return r;
281  }
282 
283  /**
284   * Sorts the instances according to the given attribute/dimension.
285   * The sorting is done on the master index array and not on the
286   * actual instances object.
287   *
288   * @param insts The instances on which the tree is (or is
289   * to be) built.
290   * @param indices The master index array containing indices
291   * of the instances.
292   * @param attidx The dimension/attribute based on which
293   * the instances should be sorted.
294   * @param left The begining index of the portion of the master
295   * index array that needs to be sorted.
296   * @param right The end index of the portion of the master index
297   * array that needs to be sorted.
298   */
299  protected static void quickSort(Instances insts, int[] indices, int attidx, int left, int right) {
300
301    if (left < right) {
302      int middle = partition(insts, indices, attidx, left, right);
303      quickSort(insts, indices, attidx, left, middle);
304      quickSort(insts, indices, attidx, middle + 1, right);
305    }
306  } 
307
308  /**
309   * Method to validate the sorting done by quickSort().
310   *
311   * @param insts The instances on which the tree is (or is
312   * to be) built.
313   * @param indices The master index array containing indices
314   * of the instances.
315   * @param attidx The dimension/attribute based on which
316   * the instances should be sorted.
317   * @param start The start of the portion in master index
318   * array that needs to be sorted.
319   * @param end The end of the portion in master index
320   * array that needs to be sorted.
321   * @throws Exception If the indices of the instances
322   * are not in sorted order.
323   */
324  private static void checkSort(Instances insts, int[] indices, int attidx, 
325                               int start, int end) throws Exception {
326    for(int i=start+1; i<=end; i++) {
327      if( insts.instance(indices[i-1]).value(attidx) > 
328          insts.instance(indices[i]).value(attidx) ) {
329        System.out.println("value[i-1]: "+insts.instance(indices[i-1]).value(attidx));
330        System.out.println("value[i]: "+insts.instance(indices[i]).value(attidx));
331        System.out.println("indices[i-1]: "+indices[i-1]);
332        System.out.println("indices[i]: "+indices[i]);
333        System.out.println("i: "+i);
334        if(insts.instance(indices[i-1]).value(attidx) > insts.instance(indices[i]).value(attidx))
335          System.out.println("value[i-1] > value[i]");
336       
337        throw new Exception("Indices not sorted correctly.");
338      }//end if
339    }
340  }
341 
342  /**
343   * Re-arranges the indices array so that in the portion of the array
344   * belonging to the node to be split, the points <= to the splitVal
345   * are on the left of the portion and those > the splitVal are on the right.
346   *
347   * @param indices The master index array.
348   * @param startidx The begining index of portion of indices that needs
349   * re-arranging.
350   * @param endidx The end index of portion of indices that needs
351   * re-arranging.
352   * @param splitDim The split dimension/attribute.
353   * @param splitVal The split value.
354   * @return The startIdx of the points > the splitVal (the points
355   * belonging to the right child of the node).
356   */
357  protected int rearrangePoints(int[] indices, final int startidx, final int endidx,
358                              final int splitDim, final double splitVal) {
359   
360    int tmp, left = startidx - 1;
361    for (int i = startidx; i <= endidx; i++) {
362      if (m_EuclideanDistance.valueIsSmallerEqual(m_Instances
363          .instance(indices[i]), splitDim, splitVal)) {
364        left++;
365        tmp = indices[left];
366        indices[left] = indices[i];
367        indices[i] = tmp;
368      }// end valueIsSmallerEqual
369    }// endfor
370    return left + 1;
371  }
372 
373  /**
374   * Returns the revision string.
375   *
376   * @return            the revision
377   */
378  public String getRevision() {
379    return RevisionUtils.extract("$Revision: 5953 $");
380  }
381}
Note: See TracBrowser for help on using the repository browser.