00001 #include "mm/ug_bitext.h"
00002 #include <boost/format.hpp>
00003 using namespace std;
00004 using namespace Moses;
00005 using namespace ugdiss;
00006 using namespace sapt;
00007
00008 typedef L2R_Token<SimpleWordId> Token;
00009 typedef mmTtrack<Token> ttrack_t;
00010 typedef mmTSA<Token> tsa_t;
00011
00012 TokenIndex V1,V2;
00013 boost::shared_ptr<ttrack_t> T1,T2;
00014 tsa_t I1,I2;
00015
00016 float lbop_level = .05;
00017 #define smooth 1
00018 namespace stats
00019 {
00020 using namespace Moses;
00021 using namespace sapt;
00022 float
00023 pmi(size_t j,size_t m1, size_t m2, size_t N)
00024 {
00025 #if smooth
00026 float p1 = lbop(N,m1,lbop_level);
00027 float p2 = lbop(N,m2,lbop_level);
00028 float p12 = lbop(N,j,lbop_level);
00029 return log(p12) - log(p1) - log(p2);
00030 #else
00031 return log(j) + log(N) - log(m1) - log(m2);
00032 #endif
00033 }
00034
00035 float
00036 npmi(size_t j,size_t m1, size_t m2, size_t N)
00037 {
00038 #if smooth
00039 float p1 = lbop(N,m1,lbop_level);
00040 float p2 = lbop(N,m2,lbop_level);
00041 float p12 = lbop(N,j,lbop_level);
00042 return (log(p12) - log(p1) - log(p2)) / -log(p12);
00043 #else
00044 return pmi(j,m1,m2,N) / (log(N) - log(j));
00045 #endif
00046 }
00047
00048 float
00049 mi(size_t j,size_t m1, size_t m2, size_t N)
00050 {
00051 float ret = 0;
00052 if (j) ret += float(j)/N * pmi(j,m1,m2,N);
00053 if (m1>j) ret += float(m1-j)/N * pmi(m1-j,m1,N-m2,N);
00054 if (m2>j) ret += float(m2-j)/N * pmi(m2-j,N-m1,m2,N);
00055 if (N>m1+m2-j) ret += float(N-m1-m2+j)/N * pmi(N-m1-m2+j,N-m1,N-m2,N);
00056 return ret;
00057 }
00058 }
00059
00060 struct SinglePhrase
00061 {
00062 typedef map<uint64_t,SPTR<SinglePhrase> > cache_t;
00063 uint64_t pid;
00064 vector<ttrack::Position> occs;
00065 };
00066
00067
00068 struct PPair
00069 {
00070 struct score_t;
00071 uint64_t p1,p2;
00072 ushort s1,e1,s2,e2;
00073 int parent;
00074
00075 struct stats_t
00076 {
00077 typedef map<pair<uint64_t,uint64_t>, SPTR<stats_t> > cache_t;
00078 size_t m1,m2,j;
00079 float npmi;
00080 float pmi;
00081 float mi;
00082 float score;
00083
00084 void
00085 set(vector<ttrack::Position> const& o1,
00086 vector<ttrack::Position> const& o2,
00087 size_t const N)
00088 {
00089 m1 = m2 = j = 0;
00090 size_t i1=0,i2=0;
00091 while (i1 < o1.size() && i2 < o2.size())
00092 {
00093 if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; }
00094 if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; }
00095
00096 if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; }
00097 else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; }
00098 else { ++i2; ++m2; }
00099 }
00100
00101
00102
00103
00104
00105 m1 = 1; m2 = 1;
00106 for (i1=1; i1 < o1.size(); ++i1)
00107 if (o1[i1-1].sid != o1[i1].sid) ++m1;
00108 for (i2=1; i2 < o2.size(); ++i2)
00109 if (o2[i2-1].sid != o2[i2].sid) ++m2;
00110
00111 this->mi = stats::mi(j,m1,m2,N);
00112 this->pmi = stats::pmi(j,m1,m2,N);
00113 this->npmi = stats::npmi(j,m1,m2,N);
00114
00115
00116 this->score = npmi;
00117 }
00118 } stats;
00119
00120 PPair(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0)
00121 : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { }
00122
00123
00124 bool
00125 operator<(PPair const& other) const
00126 {
00127 return (this->stats.score == other.stats.score
00128 ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2)
00129 : (this->stats.score > other.stats.score));
00130 }
00131
00132 size_t len1() const { return e1 - s1; }
00133 size_t len2() const { return e2 - s2; }
00134 bool includes(PPair const& o) const
00135 {
00136 return s1 <= o.s1 && e1 >= o.e1 && s2 <= o.s2 && e2 >= o.e2;
00137 }
00138
00139 };
00140
00141 SinglePhrase::cache_t cache1,cache2;
00142 PPair::stats_t::cache_t ppcache;
00143
00144
00145 struct SortByPositionInCorpus
00146 {
00147 bool
00148 operator()(ttrack::Position const& a,
00149 ttrack::Position const& b) const
00150 {
00151 return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset;
00152 }
00153 };
00154
00155
00156 void
00157 getoccs(tsa_t::tree_iterator const& m,
00158 vector<ttrack::Position>& occs)
00159 {
00160 occs.clear();
00161 occs.reserve(m.approxOccurrenceCount()+10);
00162 tsa::ArrayEntry I(m.lower_bound(-1));
00163 char const* stop = m.upper_bound(-1);
00164 do {
00165 m.root->readEntry(I.next,I);
00166 occs.push_back(I);
00167 } while (I.next != stop);
00168 sort(occs.begin(),occs.end(),SortByPositionInCorpus());
00169 }
00170
00171 void
00172 lookup_phrases(vector<id_type> const& snt,
00173 TokenIndex& V, ttrack_t const& T,
00174 tsa_t const& I, SinglePhrase::cache_t& cache,
00175 vector<vector<SPTR<SinglePhrase> > >& dest)
00176 {
00177 dest.resize(snt.size());
00178 for (size_t i = 0; i < snt.size(); ++i)
00179 {
00180 tsa_t::tree_iterator m(&I);
00181 dest[i].clear();
00182 for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
00183 {
00184 if (m.approxOccurrenceCount() < 3) break;
00185
00186 SPTR<SinglePhrase>& o = cache[m.getPid()];
00187 if (!o)
00188 {
00189 o.reset(new SinglePhrase());
00190 o->pid = m.getPid();
00191 getoccs(m,o->occs);
00192 }
00193 dest[i].push_back(o);
00194 }
00195 }
00196 }
00197
00198 struct
00199 RowIndexSorter
00200 {
00201 vector<vector<float> > const& M;
00202 size_t const my_col;
00203 RowIndexSorter(vector<vector<float> > const& m, size_t const c)
00204 : M(m), my_col(c) { }
00205
00206 template<typename T>
00207 bool
00208 operator()(T const& a, T const& b) const
00209 {
00210 return M.at(a).at(my_col) > M.at(b).at(my_col);
00211 }
00212 };
00213
00214 struct
00215 ColIndexSorter
00216 {
00217 vector<vector<float> > const& M;
00218 size_t const my_row;
00219 ColIndexSorter(vector<vector<float> > const& m, size_t const r)
00220 : M(m), my_row(r) { }
00221
00222 template<typename T>
00223 bool
00224 operator()(T const& a, T const& b) const
00225 {
00226 return M.at(my_row).at(a) > M[my_row].at(b);
00227 }
00228
00229 };
00230
00231 int main(int argc, char* argv[])
00232 {
00233 string base = argv[1];
00234 string L1 = argv[2];
00235 string L2 = argv[3];
00236
00237 T1.reset(new ttrack_t());
00238 T2.reset(new ttrack_t());
00239
00240 V1.open(base + L1 + ".tdx");
00241 T1->open(base + L1 + ".mct");
00242 I1.open(base + L1 + ".sfa", T1);
00243
00244 V2.open(base + L2 + ".tdx");
00245 T2->open(base + L2 + ".mct");
00246 I2.open(base + L2 + ".sfa", T2);
00247
00248 tsa_t::tree_iterator m1(&I1);
00249 tsa_t::tree_iterator m2(&I1);
00250 string line1, line2;
00251 while (getline(cin,line1) and getline(cin,line2))
00252 {
00253 cout << "\n" << line1 << "\n" << line2 << endl;
00254 vector<vector<SPTR<SinglePhrase> > > M1,M2;
00255 vector<id_type> snt1,snt2;
00256 V1.fillIdSeq(line1,snt1);
00257 V2.fillIdSeq(line2,snt2);
00258 lookup_phrases(snt1,V1,*T1,I1,cache1,M1);
00259 lookup_phrases(snt2,V2,*T2,I2,cache2,M2);
00260
00261 vector<PPair> pp_all, pp_good;
00262 vector<int> a1(snt1.size(),-1);
00263 vector<int> a2(snt2.size(),-1);
00264
00265 vector<vector<int> > z1(snt1.size(),vector<int>(snt1.size(),-1));
00266 vector<vector<int> > z2(snt2.size(),vector<int>(snt2.size(),-1));
00267 vector<vector<vector<PPair> > >ppm1(M1.size()),ppm2(M2.size());
00268 vector<vector<float> > M(snt1.size(), vector<float>(snt2.size(),0));
00269 vector<vector<size_t> > best1(snt1.size()), best2(snt2.size());
00270 for (size_t i1 = 0; i1 < M1.size(); ++i1)
00271 {
00272 PPair pp;
00273 pp.s1 = i1;
00274 ppm1[i1].resize(M1[i1].size());
00275 for (size_t i2 = 0; i2 < M2.size(); ++i2)
00276 {
00277 pp.s2 = i2;
00278 pp.stats.j = 1;
00279 ppm2[i2].resize(M2[i2].size());
00280 for (size_t k1 = 0; k1 < M1[i1].size(); ++k1)
00281 {
00282 pp.e1 = i1 + k1 + 1;
00283
00284 for (size_t k2 = 0; k2 < M2[i2].size(); ++k2)
00285 {
00286 pp.e2 = i2 + k2 + 1;
00287 SPTR<PPair::stats_t> & s
00288 = ppcache[make_pair(M1[i1][k1]->pid,M2[i2][k2]->pid)];
00289 if (!s)
00290 {
00291 s.reset(new PPair::stats_t());
00292 s->set(M1[i1][k1]->occs,M2[i2][k2]->occs,T1->size());
00293 }
00294 pp.stats = *s;
00295 if (pp.stats.j == 0) break;
00296
00297
00298 size_t J = pp.stats.j * 100;
00299 if (pp.stats.score > 0
00300 && J >= pp.stats.m1
00301 && J > pp.stats.m2)
00302 { pp_all.push_back(pp); }
00303 }
00304 }
00305 }
00306 }
00307 sort(pp_all.begin(),pp_all.end());
00308 #if 0
00309 BOOST_FOREACH(PPair const& pp,pp_all)
00310 {
00311 if (pp.stats.npmi < 0) continue;
00312 for (size_t r = pp.s1; r < pp.e1; ++r)
00313 for (size_t c = pp.s2; c < pp.e2; ++c)
00314 {
00315
00316 M[r][c] += log(1-pp.stats.mi);
00317 }
00318 }
00319 for (size_t r = 0; r < M.size(); ++r)
00320 for (size_t c = 0; c < M[r].size(); ++c)
00321 M[r][c] = 1.-exp(M[r][c]);
00322 for (size_t r = 0; r < best1.size(); ++r)
00323 {
00324 best1[r].resize(snt2.size());
00325 for (size_t c = 0; c < best1[r].size(); ++c)
00326 best1[r][c] = c;
00327 sort(best1[r].begin(),best1[r].end(),ColIndexSorter(M,r));
00328 }
00329 for (size_t c = 0; c < best2.size(); ++c)
00330 {
00331 best2[c].resize(snt1.size());
00332 for (size_t r = 0; r < best2[c].size(); ++r)
00333 best2[c][r] = r;
00334 sort(best2[c].begin(),best2[c].end(),RowIndexSorter(M,c));
00335 }
00336 for (size_t r = 0; r < best1.size(); ++r)
00337 {
00338 cout << V1[snt1[r]] << ":";
00339 for (size_t i = 0; i < min(3UL,M[r].size()); ++i)
00340 {
00341 size_t k = best1[r][i];
00342
00343 cout << " " << k << ":" << V2[snt2[k]] << " " << M[r][k];
00344 }
00345 cout << endl;
00346 }
00347 #endif
00348 #if 0
00349 for (size_t k = 1; k < pp_all.size(); ++k)
00350 for (size_t i = k; i--;)
00351 if (pp_all[i].s1 >= pp_all[k].s1 &&
00352 pp_all[i].e1 <= pp_all[k].e1 &&
00353 pp_all[i].s2 >= pp_all[k].s2 &&
00354 pp_all[i].e2 <= pp_all[k].e2)
00355 pp_all[i].stats.score += pp_all[k].stats.score;
00356 sort(pp_all.begin(),pp_all.end());
00357 #endif
00358
00359 #if 1
00360 vector<int> assoc1(snt1.size(),-1), assoc2(snt2.size(),-1);
00361 for (size_t p = 0; p < pp_all.size(); ++p)
00362 {
00363 PPair const& x = pp_all[p];
00364
00365
00366
00367 for (size_t i = x.s1; i < x.e1; ++i)
00368 {
00369 if (assoc1[i] < 0)
00370 assoc1[i] = p;
00371 else
00372 {
00373
00374
00375
00376 }
00377 }
00378 for (size_t i = x.s2; i < x.e2; ++i)
00379 {
00380 if (assoc2[i] < 0)
00381 assoc2[i] = p;
00382 else
00383 {
00384
00385
00386
00387 }
00388 }
00389 z1[x.s1][x.e1-1] = p;
00390 z2[x.s2][x.e2-1] = p;
00391 continue;
00392 cout << (boost::format("%.4f %.8f %.4f")
00393 % x.stats.score
00394 % x.stats.mi
00395 % x.stats.npmi);
00396 for (size_t z = x.s1; z < x.e1; ++z)
00397 cout << " " << V1[snt1[z]];
00398 cout << " :::";
00399 for (size_t z = x.s2; z < x.e2; ++z)
00400 cout << " " << V2[snt2[z]];
00401 cout << " ["
00402 << x.stats.m1 << "/" << x.stats.j << "/" << x.stats.m2
00403 << "]" << endl;
00404 }
00405 vector<bool> done(pp_all.size(),false);
00406 for (size_t i = 0; i < snt1.size(); ++i)
00407 {
00408 if (assoc1[i] < 0 || done[assoc1[i]])
00409 continue;
00410
00411
00412 {
00413 done[assoc1[i]] = true;
00414 PPair& p = pp_all[assoc1[i]];
00415 for (size_t j = p.s1; j < p.e1; ++j)
00416 cout << j << ":" << V1[snt1[j]] << " ";
00417 cout << " ::: ";
00418 for (size_t j = p.s2; j < p.e2; ++j)
00419 cout << j << ":" << V2[snt2[j]] << " ";
00420 cout << "["
00421 << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2
00422 << "] "<< p.stats.score << endl;
00423
00424 }
00425 }
00426 cout << endl;
00427 for (size_t i = 0; i < snt2.size(); ++i)
00428 {
00429 if (assoc2[i] < 0 || done[assoc2[i]])
00430 continue;
00431 done[assoc2[i]] = true;
00432 PPair& p = pp_all[assoc2[i]];
00433 for (size_t j = p.s1; j < p.e1; ++j)
00434 cout << j << ":" << V1[snt1[j]] << " ";
00435 cout << " ::: ";
00436 for (size_t j = p.s2; j < p.e2; ++j)
00437 cout << j << ":" << V2[snt2[j]] << " ";
00438 cout << "["
00439 << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2
00440 << "] "<< p.stats.score << endl;
00441 }
00442 #endif
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514 }
00515 }
00516