/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
16 | |
---|
/*
 *    AbstractStringDistanceFunction.java
 *    Copyright (C) 2008 Bruno Woltzenlogel Paleo (http://www.logic.at/people/bruno/ ; http://bruno-wp.blogspot.com/)
 *
 */
22 | |
---|
23 | package weka.core; |
---|
24 | |
---|
25 | import weka.core.neighboursearch.PerformanceStats; |
---|
26 | |
---|
27 | /** |
---|
28 | * Represents the abstract ancestor for string-based distance functions, like |
---|
29 | * EditDistance. |
---|
30 | * |
---|
31 | * @author Bruno Woltzenlogel Paleo |
---|
32 | * @version $Revision: 5987 $ |
---|
33 | */ |
---|
34 | public abstract class AbstractStringDistanceFunction |
---|
35 | extends NormalizableDistance { |
---|
36 | |
---|
37 | /** |
---|
38 | * Constructor that doesn't set the data |
---|
39 | */ |
---|
40 | public AbstractStringDistanceFunction() { |
---|
41 | super(); |
---|
42 | } |
---|
43 | |
---|
44 | /** |
---|
45 | * Constructor that sets the data |
---|
46 | * |
---|
47 | * @param data the set of instances that will be used for |
---|
48 | * later distance comparisons |
---|
49 | */ |
---|
50 | public AbstractStringDistanceFunction(Instances data) { |
---|
51 | super(data); |
---|
52 | } |
---|
53 | |
---|
54 | |
---|
55 | /** |
---|
56 | * Updates the current distance calculated so far with the new difference |
---|
57 | * between two attributes. The difference between the attributes was |
---|
58 | * calculated with the difference(int,double,double) method. |
---|
59 | * |
---|
60 | * @param currDist the current distance calculated so far |
---|
61 | * @param diff the difference between two new attributes |
---|
62 | * @return the update distance |
---|
63 | * @see #difference(int, double, double) |
---|
64 | */ |
---|
65 | protected double updateDistance(double currDist, double diff) { |
---|
66 | return (currDist + (diff * diff)); |
---|
67 | } |
---|
68 | |
---|
69 | /** |
---|
70 | * Computes the difference between two given attribute |
---|
71 | * values. |
---|
72 | * |
---|
73 | * @param index the attribute index |
---|
74 | * @param val1 the first value |
---|
75 | * @param val2 the second value |
---|
76 | * @return the difference |
---|
77 | */ |
---|
78 | protected double difference(int index, String string1, String string2) { |
---|
79 | switch (m_Data.attribute(index).type()) { |
---|
80 | case Attribute.STRING: |
---|
81 | double diff = stringDistance(string1, string2); |
---|
82 | if (m_DontNormalize == true) { |
---|
83 | return diff; |
---|
84 | } |
---|
85 | else { |
---|
86 | if (string1.length() > string2.length()) { |
---|
87 | return diff/((double) string1.length()); |
---|
88 | } |
---|
89 | else { |
---|
90 | return diff/((double) string2.length()); |
---|
91 | } |
---|
92 | } |
---|
93 | |
---|
94 | default: |
---|
95 | return 0; |
---|
96 | } |
---|
97 | } |
---|
98 | |
---|
99 | /** |
---|
100 | * Calculates the distance between two instances. Offers speed up (if the |
---|
101 | * distance function class in use supports it) in nearest neighbour search by |
---|
102 | * taking into account the cutOff or maximum distance. Depending on the |
---|
103 | * distance function class, post processing of the distances by |
---|
104 | * postProcessDistances(double []) may be required if this function is used. |
---|
105 | * |
---|
106 | * @param first the first instance |
---|
107 | * @param second the second instance |
---|
108 | * @param cutOffValue If the distance being calculated becomes larger than |
---|
109 | * cutOffValue then the rest of the calculation is |
---|
110 | * discarded. |
---|
111 | * @param stats the performance stats object |
---|
112 | * @return the distance between the two given instances or |
---|
113 | * Double.POSITIVE_INFINITY if the distance being |
---|
114 | * calculated becomes larger than cutOffValue. |
---|
115 | */ |
---|
116 | @Override |
---|
117 | public double distance(Instance first, Instance second, double cutOffValue, PerformanceStats stats) { |
---|
118 | double sqDistance = 0; |
---|
119 | int numAttributes = m_Data.numAttributes(); |
---|
120 | |
---|
121 | validate(); |
---|
122 | |
---|
123 | double diff; |
---|
124 | |
---|
125 | for (int i = 0; i < numAttributes; i++) { |
---|
126 | diff = 0; |
---|
127 | if (m_ActiveIndices[i]) { |
---|
128 | diff = difference(i, first.stringValue(i), second.stringValue(i)); |
---|
129 | } |
---|
130 | sqDistance = updateDistance(sqDistance, diff); |
---|
131 | if (sqDistance > (cutOffValue * cutOffValue)) return Double.POSITIVE_INFINITY; |
---|
132 | } |
---|
133 | double distance = Math.sqrt(sqDistance); |
---|
134 | return distance; |
---|
135 | } |
---|
136 | |
---|
137 | /** |
---|
138 | * Calculates the distance between two strings. |
---|
139 | * Must be implemented by any non-abstract StringDistance class |
---|
140 | * |
---|
141 | * @param stringA the first string |
---|
142 | * @param stringB the second string |
---|
143 | * @return the distance between the two given strings |
---|
144 | */ |
---|
145 | abstract double stringDistance(String stringA, String stringB); |
---|
146 | |
---|
147 | } |
---|