00001 #include <string>
00002 #include <cassert>
00003 #include <iomanip>
00004 #include <algorithm>
00005 #include "ug_stringdist.h"
00006
00007
00008
00009 namespace stringdist
00010 {
00011
00012 UErrorCode strip_accents(UnicodeString & trg)
00013 {
00014 UErrorCode status = U_ZERO_ERROR;
00015 static Transliterator *stripper
00016 = Transliterator::createInstance("NFD; [:M:] Remove; NFC",
00017 UTRANS_FORWARD, status);
00018 stripper->transliterate(trg);
00019 return status;
00020 }
00021
00022 char const*
00023 StringDiff::
00024 Segment::
00025 elabel[] = { "same", "cap", "flip", "permutation",
00026 "accent", "duplication",
00027 "insertion", "deletion",
00028 "mismatch", "noinit" };
00029
00030 StringDiff::
00031 StringDiff()
00032 {}
00033
00034 StringDiff::
00035 StringDiff(string const& a, string const& b)
00036 {
00037 set_a(a);
00038 set_b(b);
00039 align();
00040 }
00041
00042 StringDiff::
00043 Segment::
00044 Segment()
00045 : start_a(-1), end_a(-1), start_b(-1), end_b(-1), match(noinit), dist(0)
00046 {}
00047
00048 UnicodeString const&
00049 StringDiff::
00050 set_a(string const& a)
00051 {
00052 this->a = a.c_str();
00053 return this->a;
00054 }
00055
00056 UnicodeString const&
00057 StringDiff::
00058 set_b(string const& b)
00059 {
00060 this->b = b.c_str();
00061 return this->b;
00062 }
00063
00064 UnicodeString const&
00065 StringDiff::
00066 get_a() const
00067 {
00068 return this->a;
00069 }
00070
00071 UnicodeString const&
00072 StringDiff::
00073 get_b() const
00074 {
00075 return this->b;
00076 }
00077
00078 size_t
00079 StringDiff::
00080 size()
00081 {
00082 return this->difflist.size();
00083 }
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101 void
00102 StringDiff::
00103 fillAlignmentMatrix(vector<vector<float> > & M) const
00104 {
00105 assert(a.length() && b.length());
00106 M.assign(a.length(),vector<float>(b.length(),0));
00107 int i = 0,j;
00108 while (i < b.length() && b[i] != a[0]) ++i;
00109 while (i < b.length()) M[0][i++] = 1;
00110 i = 0;
00111 while (i < a.length() && a[i] != b[0]) ++i;
00112 while (i < a.length()) M[i++][0] = 1;
00113 for (i = 1; i < a.length(); ++i)
00114 {
00115 for (j = 1; j < b.length(); ++j)
00116 {
00117 float & s = M[i][j];
00118 s = max(M[i-1][j],M[i][j-1]);
00119 if (a[i] == b[j])
00120 s = max(s,M[i-1][j-1] + 1 + (a[i-1] == b[j-1] ? .1f : 0));
00121 }
00122 }
00123 #if 0
00124 string abuf,bbuf;
00125 a.toUTF8String(abuf);
00126 b.toUTF8String(bbuf);
00127 cout << " " << bbuf[0];
00128 for (int x = 1; x < b.length(); ++x)
00129 cout << " " << bbuf[x];
00130 cout << endl;
00131 for (int x = 0; x < a.length(); ++x)
00132 {
00133 cout << abuf[x] << " ";
00134 for (int y = 0; y < b.length(); ++y)
00135 cout << int(M[x][y]) << " ";
00136 cout << endl;
00137 }
00138 #endif
00139 }
00140
00141 float
00142 fillAlignmentMatrix(UChar const* a, size_t const lenA,
00143 UChar const* b, size_t const lenB,
00144 vector<vector<float> > & M)
00145 {
00146 M.assign(lenA,vector<float>(lenB,0));
00147 assert(lenA); assert(lenB);
00148 size_t i = 0;
00149 while (i < lenB && b[i] != a[0]) ++i;
00150 while (i < lenB) M[0][i++] = 1;
00151 i = 0;
00152 while (i < lenA && a[i] != b[0]) ++i;
00153 while (i < lenA) M[i++][0] = 1;
00154 for (i = 1; i < lenA; ++i)
00155 {
00156 for (size_t j = 1; j < lenB; ++j)
00157 {
00158 float & s = M[i][j];
00159 s = max(M[i-1][j], M[i][j-1]);
00160 if (a[i] == b[j])
00161 s = max(s, M[i-1][j-1] + 1);
00162 }
00163 }
00164 return M.back().back();
00165 }
00166
00167 float
00168 levenshtein(UChar const* a, size_t const lenA,
00169 UChar const* b, size_t const lenB)
00170 {
00171 vector<vector<float> > M;
00172 fillAlignmentMatrix(a,lenA,b,lenB,M);
00173 size_t ret = 0;
00174 #define DEBUGME 0
00175 #if DEBUGME
00176 for (size_t i = 0; i < M.size(); ++i)
00177 {
00178 for (size_t j = 0; j < M[i].size(); ++j)
00179 cout << M[i][j] << " ";
00180 cout << endl;
00181 }
00182 cout << string(25,'-') << endl;
00183 #endif
00184
00185 int i = M.size() -1;
00186 int j = M.back().size() -1;
00187 int I=i, J=j;
00188 for (;i >= 0 || j >= 0; --i, --j)
00189 {
00190 I=i, J=j;
00191 if (j>=0) while (i > 0 && M[i-1][j] == M[i][j]) --i;
00192 if (i>=0) while (j > 0 && M[i][j-1] == M[i][j]) --j;
00193 size_t ilen = I >= 0 ? I - i : 0;
00194 size_t jlen = J >= 0 ? J - j : 0;
00195 ret += max(ilen,jlen);
00196 #if DEBUGME
00197 cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl;
00198 #endif
00199 I=i, J=j;
00200 }
00201 size_t ilen = I >= 0 ? I - i : 0;
00202 size_t jlen = J >= 0 ? J - j : 0;
00203 ret += max(ilen,jlen);
00204 #if DEBUGME
00205 cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl;
00206 #endif
00207 return ret;
00208 }
00209
00210
00211
00212 StringDiff::
00213 Segment::
00214 Segment(size_t const as, size_t const ae,
00215 size_t const bs, size_t const be,
00216 UnicodeString const& a,
00217 UnicodeString const& b)
00218 {
00219 dist = 0;
00220 start_a = as; end_a = ae;
00221 start_b = bs; end_b = be;
00222 if (as == ae)
00223 match = bs == be ? same : insertion;
00224 else if (bs == be)
00225 match = deletion;
00226 else if (be-bs != ae-as)
00227 {
00228 match = mismatch;
00229 dist = stringdist::levenshtein(a.getBuffer() + as, ae - as,
00230 b.getBuffer() + bs, be - bs);
00231 }
00232 else
00233 {
00234 match = same;
00235 size_t stop = ae-as;
00236 for (size_t i = 0; i < stop && match == same; ++i)
00237 if (a[as+i] != b[bs+i]) match = mismatch;
00238 if (match == mismatch)
00239 {
00240 if (ae-as == 2 && a[as] == b[bs+1] && a[as+1] == b[bs])
00241 match = flip;
00242 else
00243 {
00244 vector<UChar> x(a.getBuffer() + as, a.getBuffer() + ae);
00245 vector<UChar> y(b.getBuffer() + bs, b.getBuffer() + be);
00246 sort(x.begin(),x.end());
00247 sort(y.begin(),y.end());
00248 if (x == y) match = permutation;
00249 else dist = stringdist::levenshtein(a.getBuffer() + as, ae - as,
00250 b.getBuffer() + bs, be - bs);
00251 }
00252 }
00253 }
00254 if (match == insertion)
00255 {
00256 dist = be-bs;
00257 }
00258 else if (match == deletion)
00259 {
00260 dist = ae-as;
00261 }
00262 else if (match == flip) dist = 1;
00263 else if (match == permutation) dist = ae-as-1;
00264 if (match == mismatch)
00265 {
00266 UnicodeString ax(a,as,ae-as);
00267 UnicodeString bx(b,bs,be-bs);
00268 if (ax.toLower() == bx.toLower())
00269 match = cap;
00270 else
00271 {
00272 strip_accents(ax);
00273 strip_accents(bx);
00274 if (ax == bx) match = accent;
00275 }
00276 }
00277 }
00278
00279 size_t
00280 StringDiff::
00281 align(bool force)
00282 {
00283 if (force) difflist.clear();
00284 if (difflist.size()) return 0;
00285 vector<vector<float> > M;
00286 fillAlignmentMatrix(M);
00287
00288 int i = a.length() - 1;
00289 int j = b.length() - 1;
00290 vector<int> A(a.length(), -1);
00291 vector<int> B(b.length(), -1);
00292 while (i + j)
00293 {
00294 while (i && M[i-1][j] == M[i][j]) --i;
00295 while (j && M[i][j-1] == M[i][j]) --j;
00296 if (a[i] == b[j]) { A[i] = j; B[j] = i; }
00297 if (i) --i;
00298 if (j) --j;
00299 }
00300 i = a.length() - 1;
00301 j = b.length() - 1;
00302 vector<int> A2(a.length(), -1);
00303 vector<int> B2(b.length(), -1);
00304 while (i + j)
00305 {
00306 while (j && M[i][j-1] == M[i][j]) --j;
00307 while (i && M[i-1][j] == M[i][j]) --i;
00308 if (a[i] == b[j]) { A2[i] = j; B2[j] = i; }
00309 if (i) --i;
00310 if (j) --j;
00311 }
00312 for (size_t k = 0; k < A.size(); ++k)
00313 A[k] = min(A[k],A2[k]);
00314 for (size_t k = 0; k < B.size(); ++k)
00315 B[k] = min(B[k],B2[k]);
00316
00317 if (a[i] == b[j]) { A[i] = j; B[j] = i; }
00318 i = 0;
00319 j = 0;
00320 size_t I, J;
00321 while (i < a.length() and j < b.length())
00322 {
00323 if (A[i] < 0)
00324 {
00325 I = i + 1;
00326 while (I < A.size() and A[I] < 0) ++I;
00327 if (i)
00328 { for (J = j = A[i-1]+1; J < B.size() && B[J] < 0; ++J); }
00329 else if (I < A.size())
00330 { for (j = J = A[I]; j && B[j-1] < 0; --j); }
00331 else J = B.size();
00332 difflist.push_back(Segment(i,I,j,J,a,b));
00333 i = I; j = J;
00334 }
00335 else if (B[j] < 0)
00336 {
00337 for (J = j + 1; J < B.size() && B[J] < 0; ++J);
00338 difflist.push_back(Segment(i,i,j,J,a,b));
00339 j = J;
00340 }
00341 else
00342 {
00343 I = i;
00344 J = j;
00345 while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0)
00346 { ++I; ++J; }
00347 difflist.push_back(Segment(i,I,j,J,a,b));
00348 i = I; j = J;
00349 }
00350 }
00351 if (i < a.length() || j < b.length())
00352 difflist.push_back(Segment(i,a.length(),j,b.length(),a,b));
00353
00354 diffcnt.assign(noinit,0);
00355 for (size_t i = 0; i < difflist.size(); ++i)
00356 {
00357 Segment & s = difflist[i];
00358 if (s.match == insertion and
00359 ((s.start_a and a[s.start_a - 1] == b[s.start_b]) or
00360 (s.end_a < a.length() and a[s.end_a] == b[s.start_b])))
00361 {
00362 bool sameletter = true;
00363 for (int i = s.start_b + 1; sameletter and i < s.end_b; ++i)
00364 sameletter = b[i] == b[i-1];
00365 if (sameletter) s.match = duplication;
00366 }
00367 else if (s.match == deletion and
00368 ((s.start_b and b[s.start_b - 1] == a[s.start_a]) or
00369 (s.end_b < b.length() and b[s.end_b] == a[s.start_a])))
00370 {
00371 bool sameletter = true;
00372 for (int i = s.start_a + 1; sameletter and i < s.end_a; ++i)
00373 sameletter = a[i] == a[i-1];
00374 if (sameletter) s.match= duplication;
00375 }
00376 ++diffcnt[s.match];
00377 }
00378 return 0;
00379 }
00380
00381 void
00382 StringDiff::
00383 showDiff(std::ostream& out)
00384 {
00385 if (difflist.size() == 0) align();
00386 vector<size_t> fromEnd(difflist.size(),0);
00387 for (int d = difflist.size()-1; d-- > 0;)
00388 {
00389 fromEnd[d] = a.length() - difflist[d].end_a;
00390
00391
00392
00393 }
00394 for (size_t d = 0; d < difflist.size(); ++d)
00395 {
00396 Segment const& s = difflist[d];
00397 UnicodeString aseg,bseg;
00398 a.extract(s.start_a, s.end_a - s.start_a, aseg);
00399 b.extract(s.start_b, s.end_b - s.start_b, bseg);
00400 string abuf,bbuf;
00401 aseg.toUTF8String(abuf);
00402 bseg.toUTF8String(bbuf);
00403 out << abuf << " ";
00404 out << bbuf << " ";
00405 out << s.label() << " "
00406 << s.dist << " "
00407 << fromEnd[d]
00408 << endl;
00409 }
00410 }
00411
00412 char const*
00413 StringDiff::
00414 Segment::
00415 label() const
00416 {
00417 return elabel[this->match];
00418 }
00419
00420 StringDiff::Segment const&
00421 StringDiff::
00422 operator[](uint32_t const i) const
00423 {
00424 return difflist.at(i);
00425 }
00426
00427 vector<int> const&
00428 StringDiff::
00429 getFeatures() const
00430 {
00431 return diffcnt;
00432 }
00433
00434 }