00001 #include "mm/ug_bitext.h"
00002 #include <boost/format.hpp>
00003
00004 #include <unicode/translit.h>
00005 #include <unicode/utypes.h>
00006 #include <unicode/unistr.h>
00007 #include <unicode/uchar.h>
00008 #include <unicode/utf8.h>
00009 #include "moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h"
00010
00011 using namespace std;
00012 using namespace Moses;
00013 using namespace ugdiss;
00014 using namespace Moses::bitext;
00015
00016 typedef L2R_Token<SimpleWordId> Token;
00017 typedef mmTtrack<Token> ttrack_t;
00018 typedef mmTSA<Token> tsa_t;
00019 typedef vector<Moses::bitext::PhrasePair<Token> > pplist_t;
00020 typedef pair<ushort,ushort> span_t;
00021
00022 TokenIndex V1,V2;
00023 boost::shared_ptr<ttrack_t> T1,T2;
00024 tsa_t I1,I2;
00025 mmBitext<Token> BT;
00026
00027 float lbop_level = .05;
00028 #define smooth 1
00029 namespace stats
00030 {
00031 using namespace Moses::bitext;
00032 float
00033 pmi(size_t j,size_t m1, size_t m2, size_t N)
00034 {
00035 #if smooth
00036 float p1 = lbop(N,m1,lbop_level);
00037 float p2 = lbop(N,m2,lbop_level);
00038 float p12 = lbop(N,j,lbop_level);
00039 return log(p12) - log(p1) - log(p2);
00040 #else
00041 return log(j) + log(N) - log(m1) - log(m2);
00042 #endif
00043 }
00044
00045 float
00046 npmi(size_t j,size_t m1, size_t m2, size_t N)
00047 {
00048 #if smooth
00049
00050 float p1 = lbop(N,m1,lbop_level);
00051 float p2 = lbop(N,m2,lbop_level);
00052 float p12 = lbop(N,j,lbop_level);
00053 return (log(p12) - log(p1) - log(p2)) / -log(p12);
00054 #else
00055 return pmi(j,m1,m2,N) / (log(N) - log(j));
00056 #endif
00057 }
00058
00059 float
00060 mi(size_t j,size_t m1, size_t m2, size_t N)
00061 {
00062 float ret = 0;
00063 if (j) ret += float(j)/N * pmi(j,m1,m2,N);
00064 if (m1>j) ret += float(m1-j)/N * pmi(m1-j,m1,N-m2,N);
00065 if (m2>j) ret += float(m2-j)/N * pmi(m2-j,N-m1,m2,N);
00066 if (N>m1+m2-j) ret += float(N-m1-m2+j)/N * pmi(N-m1-m2+j,N-m1,N-m2,N);
00067 return ret;
00068 }
00069 }
00070
00071 struct SinglePhrase
00072 {
00073 typedef map<uint64_t,SPTR<SinglePhrase> > cache_t;
00074 uint64_t pid;
00075 vector<ttrack::Position> occs;
00076 };
00077
00078
00079 struct PhrasePair2
00080 {
00081 struct score_t;
00082 uint64_t p1,p2;
00083 ushort s1,e1,s2,e2;
00084 int parent;
00085
00086 struct stats_t
00087 {
00088 typedef map<pair<uint64_t,uint64_t>, SPTR<stats_t> > cache_t;
00089 size_t m1,m2,j;
00090 float npmi;
00091 float pmi;
00092 float mi;
00093 float score;
00094
00095 void
00096 set(vector<ttrack::Position> const& o1,
00097 vector<ttrack::Position> const& o2,
00098 size_t const N)
00099 {
00100 m1 = m2 = j = 0;
00101 size_t i1=0,i2=0;
00102 while (i1 < o1.size() && i2 < o2.size())
00103 {
00104 if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; }
00105 if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; }
00106
00107 if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; }
00108 else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; }
00109 else { ++i2; ++m2; }
00110 }
00111
00112
00113
00114
00115
00116 m1 = 1; m2 = 1;
00117 for (i1=1; i1 < o1.size(); ++i1)
00118 if (o1[i1-1].sid != o1[i1].sid) ++m1;
00119 for (i2=1; i2 < o2.size(); ++i2)
00120 if (o2[i2-1].sid != o2[i2].sid) ++m2;
00121
00122 this->mi = stats::mi(j,m1,m2,N);
00123 this->pmi = stats::pmi(j,m1,m2,N);
00124 this->npmi = stats::npmi(j,m1,m2,N);
00125
00126
00127 this->score = npmi;
00128 }
00129 } stats;
00130
00131 PhrasePair2(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0)
00132 : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { }
00133
00134
00135 bool
00136 operator<(PhrasePair2 const& other) const
00137 {
00138 return (this->stats.score == other.stats.score
00139 ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2)
00140 : (this->stats.score > other.stats.score));
00141 }
00142
00143 size_t len1() const { return e1 - s1; }
00144 size_t len2() const { return e2 - s2; }
00145 bool includes(PhrasePair2 const& o) const
00146 {
00147 return s1 <= o.s1 && e1 >= o.e1 && s2 <= o.s2 && e2 >= o.e2;
00148 }
00149
00150 };
00151
00152 SinglePhrase::cache_t cache1,cache2;
00153 PhrasePair2::stats_t::cache_t ppcache;
00154
00155
00156 struct SortByPositionInCorpus
00157 {
00158 bool
00159 operator()(ttrack::Position const& a,
00160 ttrack::Position const& b) const
00161 {
00162 return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset;
00163 }
00164 };
00165
00166
00167 void
00168 getoccs(tsa_t::tree_iterator const& m,
00169 vector<ttrack::Position>& occs)
00170 {
00171 occs.clear();
00172 occs.reserve(m.approxOccurrenceCount()+10);
00173 tsa::ArrayEntry I(m.lower_bound(-1));
00174 char const* stop = m.upper_bound(-1);
00175 do {
00176 m.root->readEntry(I.next,I);
00177 occs.push_back(I);
00178 } while (I.next != stop);
00179 sort(occs.begin(),occs.end(),SortByPositionInCorpus());
00180 }
00181
00182 void
00183 lookup_phrases(vector<id_type> const& snt,
00184 TokenIndex& V, ttrack_t const& T,
00185 tsa_t const& I, SinglePhrase::cache_t& cache,
00186 vector<vector<SPTR<SinglePhrase> > >& dest)
00187 {
00188 dest.resize(snt.size());
00189 for (size_t i = 0; i < snt.size(); ++i)
00190 {
00191 tsa_t::tree_iterator m(&I);
00192 dest[i].clear();
00193 for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
00194 {
00195 if (m.approxOccurrenceCount() < 3) break;
00196
00197 SPTR<SinglePhrase>& o = cache[m.getPid()];
00198 if (!o)
00199 {
00200 o.reset(new SinglePhrase());
00201 o->pid = m.getPid();
00202 getoccs(m,o->occs);
00203 }
00204 dest[i].push_back(o);
00205 }
00206 }
00207 }
00208
00209
00210 struct
00211 RowIndexSorter
00212 {
00213 vector<vector<float> > const& M;
00214 size_t const my_col;
00215 RowIndexSorter(vector<vector<float> > const& m, size_t const c)
00216 : M(m), my_col(c) { }
00217
00218 template<typename T>
00219 bool
00220 operator()(T const& a, T const& b) const
00221 {
00222 return M.at(a).at(my_col) > M.at(b).at(my_col);
00223 }
00224 };
00225
00226 struct
00227 ColIndexSorter
00228 {
00229 vector<vector<float> > const& M;
00230 size_t const my_row;
00231 ColIndexSorter(vector<vector<float> > const& m, size_t const r)
00232 : M(m), my_row(r) { }
00233
00234 template<typename T>
00235 bool
00236 operator()(T const& a, T const& b) const
00237 {
00238 return M.at(my_row).at(a) > M[my_row].at(b);
00239 }
00240
00241 };
00242
00243 template<typename Token>
00244 class
00245 npmi_scorer1 : public Moses::bitext::PhrasePair<Token>::Scorer
00246 {
00247 public:
00248 float operator()(PhrasePair<Token>& pp) const
00249 {
00250 #if 0
00251 cout << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " "
00252 << pp.raw2 << " " << pp.sample2 << " " << pp.good2 << " "
00253 << pp.joint << " " << __FILE__ << ":" << __LINE__ << endl;
00254 #endif
00255 pp.good2 = ceil(pp.raw2 * float(pp.good1)/pp.raw1);
00256 size_t N = ceil(BT.T1->numTokens() * float(pp.good1)/pp.raw1);
00257 return pp.score = stats::npmi(pp.joint,pp.good1,pp.good2,N);
00258 }
00259 };
00260
00261
00262 class Alnhyp
00263 {
00264 ushort s1,s2,e1,e2;
00265 float score;
00266 };
00267
00268
00269 size_t
00270 lcs(string const a, string const b)
00271 {
00272 using namespace stringdist;
00273 if (a == b) return a.size();
00274 StringDiff diff(a,b);
00275 size_t ret = 0;
00276 size_t len = 0;
00277
00278 for (size_t i = 0; i < diff.size(); ++i)
00279 {
00280 StringDiff::Segment const& s = diff[i];
00281 if (s.match != StringDiff::same && s.match != StringDiff::cap)
00282 {
00283 if (len > ret) ret = len;
00284 len = 0;
00285 continue;
00286 }
00287 len += s.end_a - s.start_a;
00288 }
00289 if (len > ret) ret = len;
00290 return ret;
00291 }
00292
00293 size_t
00294 mapstring(string const& utf8,
00295 UnicodeString& U,
00296 vector<int>& c2w,
00297 vector<int>* wlen=NULL)
00298 {
00299 static UChar space = UnicodeString(" ")[0];
00300 assert(utf8.at(0) != ' ');
00301 U = UnicodeString(utf8.c_str()).toLower();
00302 stringdist::strip_accents(U);
00303 c2w.assign(U.length(),-1);
00304 size_t k = 0;
00305 size_t z = 0;
00306 for (int i = 0; i < U.length(); ++i)
00307 if (U[i] == space) { if (wlen) wlen->push_back(i-z); z = ++k; }
00308 else c2w[i] = k;
00309 assert(c2w.back() >= 0);
00310 if (wlen) wlen->push_back(U.length()-z);
00311 return k+1;
00312 }
00313
00314 void
00315 align_letters(UnicodeString const& A, vector<int> const& a2p,
00316 UnicodeString const& B, vector<int> const& b2p,
00317 vector<vector<int> >& W)
00318 {
00319 vector<vector<int> > M(A.length(),vector<int>(B.length(),0));
00320 for (int a = 0; a < A.length(); ++a)
00321 {
00322 for (int b = 0; b < B.length(); ++b)
00323 {
00324 if (A[a] != B[b] || a2p[a] < 0 || b2p[b] < 0)
00325 continue;
00326 M[a][b] = (a && b) ? M[a-1][b-1] + 1 : 1;
00327 int& x = W[a2p[a]][b2p[b]];
00328 x = max(x,M[a][b]);
00329 }
00330 }
00331
00332
00333
00334
00335
00336
00337
00338
00339 }
00340
00341 void
00342 map_back(vector<vector<int> > const& W,
00343 vector<vector<int> > & X,
00344 vector<uchar> const & aln)
00345 {
00346 for (size_t i = 0; i < aln.size(); i += 2)
00347 {
00348 vector<int> const& w = W.at(aln[i+1]);
00349 vector<int>& x = X.at(aln[i]);
00350 assert(x.size() == w.size());
00351 for (size_t k = 0; k < x.size(); ++k)
00352 x[k] = max(w[k],x[k]);
00353 }
00354 }
00355
00356
00357 void trymatch3(vector<PhrasePair<Token> > const& tcands,
00358 UnicodeString const& T, size_t const tlen,
00359 vector<int> const& t2p,
00360 TokenIndex const& V2, vector<vector<int> >&X)
00361 {
00362 BOOST_FOREACH(PhrasePair<Token> const& pp, tcands)
00363 {
00364 UnicodeString H; vector<int> h2p;
00365 string hstring = toString(V2, pp.start2, pp.len2);
00366 size_t hlen = mapstring(hstring,H,h2p);
00367 vector<vector<int> > W(hlen,vector<int>(tlen,0));
00368 align_letters(H, h2p, T, t2p, W);
00369 #if 0
00370 string s; S.toUTF8String(s);
00371 string h; H.toUTF8String(h);
00372 string t; T.toUTF8String(t);
00373 cout << s << endl << h << endl << t << endl;
00374 cout << slen << " " << tlen << endl;
00375 cout << "W: " << W.size() << " rows; " << W[0].size() << " cols" << endl;
00376 cout << "X: " << X.size() << " rows; " << X[0].size() << " cols" << endl;
00377 cout << "aln: ";
00378 for (size_t a = 0; a < pp.aln.size(); a +=2)
00379 cout << int(pp.aln[a]) << "-" << int(pp.aln[a+1]) << " ";
00380 cout << endl;
00381 #endif
00382 map_back(W,X,pp.aln);
00383 }
00384 }
00385
00386 void minmatch_filter(vector<vector<int> > & X,
00387 vector<int> const& len1,
00388 vector<int> const& len2)
00389 {
00390
00391 vector<int> m1(X.size(),0), m2(X.at(0).size(),0);
00392 for (size_t r = 0; r < X.size(); ++r)
00393 for (size_t c = 0; c < X[r].size(); ++c)
00394 {
00395 if (X[r][c] == 0) continue;
00396 m1[r] += X[r][c];
00397 m2[c] += X[r][c];
00398 }
00399
00400 bool sth_changed = true;
00401 while (sth_changed)
00402 {
00403 sth_changed = false;
00404 for (size_t r = 0; r < m1.size(); ++r)
00405 {
00406 if (m1[r] && m1[r] < max(2,min(5,len1[r]/2)))
00407 {
00408 sth_changed = true;
00409 for (size_t c = 0; c < X[r].size(); ++c)
00410 {
00411 m2[c] -= X[r][c];
00412 X[r][c] = 0;
00413 }
00414 m1[r] = 0;
00415 }
00416 }
00417
00418 for (size_t c = 0; c < m2.size(); ++c)
00419 {
00420 if (m2[c] && m2[c] < max(2,min(5,len2[c]/2)))
00421 {
00422 sth_changed = true;
00423 for (size_t r = 0; r < m1.size(); ++r)
00424 {
00425 m1[r] -= X[r][c];
00426 X[r][c] = 0;
00427 }
00428 m2[c] = 0;
00429 }
00430 }
00431 }
00432 }
00433
00434
00435 void
00436 trymatch2(TokenIndex& V1,
00437 TokenIndex& V2,
00438 string const& source,
00439 string const& target,
00440 vector<PhrasePair<Token> > const* const tcands,
00441 vector<vector<int> >& X)
00442
00443 {
00444 UnicodeString S,T;
00445 vector<int> t2p, s2p;
00446 vector<int> wlen_t, wlen_s;
00447 size_t slen = mapstring(source, S, s2p, &wlen_s);
00448 size_t tlen = mapstring(target, T, t2p, &wlen_t);
00449
00450 X.assign(slen,vector<int>(tlen,0));
00451 if (slen == 1 && tlen ==1 && S == T)
00452 X[0][0] = S.length();
00453 else
00454 {
00455 align_letters(S,s2p,T,t2p,X);
00456 if (tcands) trymatch3(*tcands, T, tlen, t2p, V2, X);
00457 }
00458
00459 minmatch_filter(X, wlen_s, wlen_t);
00460 bool hit = false;
00461 for (size_t r = 0; !hit && r < X.size(); ++r)
00462 for (size_t c = 0; !hit && c < X[r].size(); ++c)
00463 hit = X[r][c] > min(S.length(),T.length())/2;
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473 }
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539 struct ahyp
00540 {
00541 ushort s1,s2,e1,e2;
00542 float score;
00543 bool operator<(ahyp const& o) const { return score < o.score; }
00544 bool operator>(ahyp const& o) const { return score > o.score; }
00545 };
00546
00547 struct AlnPoint
00548 {
00549 enum status { no = 0, yes = 1, maybe = -1, undef = -7 };
00550 float score;
00551 status state;
00552 AlnPoint() : score(0), state(undef) {}
00553 };
00554
00555 bool overlap(span_t const& a, span_t const& b)
00556 {
00557 return !(a.second <= b.first || b.second <= a.first);
00558 }
00559
00560 class AlnMatrix
00561 {
00562 vector<bitvector> A1,A2;
00563 vector<bitvector> S1,S2;
00564 public:
00565 vector<bitvector*> m1,m2;
00566 AlnMatrix(size_t const rows, size_t const cols);
00567 bitvector const&
00568 operator[](size_t const r) const
00569 { return A1.at(r); }
00570
00571 bool
00572 incorporate(span_t const& rspan, span_t const& cspan,
00573 vector<uchar> const& aln, bool const flip);
00574
00575 size_t size() const { return A1.size(); }
00576 };
00577
00578 AlnMatrix::
00579 AlnMatrix(size_t const rows, size_t const cols)
00580 {
00581 A1.assign(rows,bitvector(cols));
00582 S1.assign(rows,bitvector(cols));
00583 A2.assign(cols,bitvector(rows));
00584 S2.assign(cols,bitvector(rows));
00585 m1.assign(rows,NULL);
00586 m2.assign(cols,NULL);
00587 }
00588
00589 bool
00590 AlnMatrix::
00591 incorporate(span_t const& rspan,
00592 span_t const& cspan,
00593 vector<uchar> const& aln,
00594 bool const flip)
00595 {
00596 for (size_t r = rspan.first; r < rspan.second; ++r)
00597 S1[r].reset();
00598 for (size_t c = cspan.first; c < cspan.second; ++c)
00599 S2[c].reset();
00600 if (flip)
00601 {
00602 for (size_t i = 0; i < aln.size(); i += 2)
00603 {
00604 size_t r = rspan.first + aln[i];
00605 size_t c = cspan.first + aln[i+1];
00606 S1[r].set(c);
00607 S2[c].set(r);
00608 }
00609 }
00610 else
00611 {
00612 for (size_t i = 0; i < aln.size(); i += 2)
00613 {
00614 size_t r = rspan.first + aln[i+1];
00615 size_t c = cspan.first + aln[i];
00616 S1[r].set(c);
00617 S2[c].set(r);
00618 }
00619 }
00620
00621 for (size_t r = rspan.first; r < rspan.second; ++r)
00622 if (m1[r] && (*m1[r]) != S1[r]) return false;
00623 for (size_t c = cspan.first; c < cspan.second; ++c)
00624 if (m2[c] && (*m2[c]) != S2[c]) return false;
00625
00626
00627 for (size_t r = rspan.first; r < rspan.second; ++r)
00628 if (!m1[r]) { A1[r] = S1[r]; m1[r] = &A1[r]; }
00629 for (size_t c = cspan.first; c < cspan.second; ++c)
00630 if (!m2[c]) { A2[c] = S2[c]; m2[c] = &A2[c]; }
00631
00632 return true;
00633 }
00634
00635 struct alink
00636 {
00637 size_t r,c,m;
00638 bool operator<(alink const& o) const { return m < o.m; }
00639 bool operator>(alink const& o) const { return m > o.m; }
00640 };
00641
00642 int main(int argc, char* argv[])
00643 {
00644 string base = argc > 1 ? argv[1] : "crp/trn/mm/";
00645 string L1 = argc > 1 ? argv[2] : "de";
00646 string L2 = argc > 1 ? argv[3] : "en";
00647 BT.open(base,L1,L2);
00648 BT.V1->setDynamic(true);
00649 BT.V2->setDynamic(true);
00650 string line1, line2;
00651 npmi_scorer1<Token> scorer;
00652 while (getline(cin,line1) and getline(cin,line2))
00653 {
00654 cout << "\n" << line1 << "\n" << line2 << endl;
00655 vector<Token> snt1,snt2;
00656 fill_token_seq(*BT.V1,line1,snt1);
00657 fill_token_seq(*BT.V2,line2,snt2);
00658 vector<vector<SPTR<vector<PhrasePair<Token> > > > > pt1,pt2;
00659 vector<vector<uint64_t> > pm1,pm2;
00660 BT.lookup(snt1,*BT.I1,pt1,&pm1,&scorer);
00661 BT.lookup(snt2,*BT.I2,pt2,&pm2,&scorer);
00662
00663
00664 typedef boost::unordered_map<uint64_t, vector<span_t> >
00665 p2s_map_t;
00666 typedef p2s_map_t::iterator p2s_iter;
00667 p2s_map_t p2s1,p2s2;
00668 for (ushort i = 0; i < pm1.size(); ++i)
00669 for (ushort k = 0; k < pm1[i].size(); ++k)
00670 p2s1[pm1[i][k]].push_back(make_pair(i,i+k+1));
00671 for (ushort i = 0; i < pm2.size(); ++i)
00672 for (ushort k = 0; k < pm2[i].size(); ++k)
00673 p2s2[pm2[i][k]].push_back(make_pair(i,i+k+1));
00674
00675 boost::unordered_map<uint64_t,SPTR<vector<PhrasePair<Token> > > > all1,all2;
00676 vector<PhrasePair<Token> > pp_all;
00677 for (size_t i = 0; i < pt2.size(); ++i)
00678 for (size_t k = 0; k < pt2[i].size(); ++k)
00679 all2[pm2[i][k]] = pt2[i][k];
00680 for (size_t i = 0; i < pt1.size(); ++i)
00681 for (size_t k = 0; k < pt1[i].size(); ++k)
00682 {
00683 all1[pm1[i][k]] = pt1[i][k];
00684 BOOST_FOREACH(PhrasePair<Token> const& pp, *pt1[i][k])
00685 {
00686 if (pp.score < 0) break;
00687 if (p2s2.find(pp.p2) != p2s2.end())
00688 pp_all.push_back(pp);
00689 }
00690 }
00691 sort(pp_all.begin(), pp_all.end(), greater<PhrasePair<Token> >());
00692 vector<int> a1(snt1.size(),-1), a2(snt2.size(),-1);
00693
00694 vector<bitvector> R(snt1.size(),bitvector(snt2.size()));
00695 vector<bitvector> C(snt2.size(),bitvector(snt1.size()));
00696 vector<bitvector> myR(snt1.size(),bitvector(snt2.size()));
00697 vector<bitvector> myC(snt2.size(),bitvector(snt1.size()));
00698 vector<bitvector*> m1(snt1.size(),NULL);
00699 vector<bitvector*> m2(snt2.size(),NULL);
00700
00701
00702 AlnMatrix A(snt1.size(),snt2.size());
00703 for (size_t p = 0; p < pp_all.size(); ++p)
00704 {
00705 PhrasePair<Token> const& pp = pp_all[p];
00706 #if 0
00707 cout << (boost::format("%30s ::: %-30s ")
00708 % BT.toString(pp.p1,0).c_str()
00709 % BT.toString(pp.p2,1).c_str());
00710 cout << (boost::format("%.4f [%d/%d/%d]")
00711 % pp.score % pp.good1 % pp.joint % pp.good2);
00712 for (size_t a = 0; a < pp.aln.size(); a += 2)
00713 cout << " " << int(pp.aln[a]) << "-" << int(pp.aln[a+1]);
00714 cout << endl;
00715 #endif
00716
00717 vector<span_t>& v1 = p2s1[pp.p1];
00718 vector<span_t>& v2 = p2s2[pp.p2];
00719 if (v1.size() == 1)
00720 for (size_t i = v1[0].first; i < v1[0].second; ++i)
00721 if (a1[i] < 0) a1[i] = p;
00722 if (v2.size() == 1)
00723 for (size_t i = v2[0].first; i < v2[0].second; ++i)
00724 if (a2[i] < 0) a2[i] = p;
00725
00726 if (v1.size() == 1 && v2.size() == 1)
00727 A.incorporate(v1[0],v2[0],pp.aln,pp.inverse);
00728 }
00729
00730 for (size_t i = 0; i < A.size(); ++i)
00731 {
00732 cout << (*BT.V1)[snt1[i].id()] << ": ";
00733 for (size_t k=A[i].find_first(); k < A[i].size(); k=A[i].find_next(k))
00734 cout << boost::format(" %d:%s") % k % (*BT.V2)[snt2[k].id()];
00735 cout << endl;
00736 }
00737
00738
00739
00740 vector<PhrasePair<Token> > const* atrans, *btrans;
00741 ahyp h;
00742 vector<ahyp> hyps;
00743 vector<vector<int> > L(snt1.size(),vector<int>(snt2.size(),0));
00744
00745
00746 for (h.s1 = 0; h.s1 < a1.size(); ++h.s1)
00747 {
00748 if (a1[h.s1] >= 0) continue;
00749 ostringstream buf1;
00750 for (h.e1 = h.s1; h.e1 < a1.size() && a1[h.e1] < 0; ++h.e1)
00751 {
00752 if (h.e1 > h.s1)
00753 {
00754 if (pt1[h.s1].size() + h.s1 <= h.e1) break;
00755 buf1 << " ";
00756 }
00757 buf1 << (*BT.V1)[snt1[h.e1].id()];
00758 atrans = pt1[h.s1].size() ? pt1[h.s1].at(h.e1-h.s1).get() : NULL;
00759 for (h.s2 = 0; h.s2 < a2.size(); ++h.s2)
00760 {
00761 ostringstream buf2;
00762 if (a2[h.s2] >= 0) continue;
00763 for (h.e2 = h.s2; h.e2 < a2.size() && a2[h.e2] < 0; ++h.e2)
00764 {
00765 if (h.e2 > h.s2)
00766 {
00767 if (pt2[h.s2].size() + h.s2 <= h.e2) break;
00768 buf2 << " ";
00769 }
00770 buf2 << (*BT.V2)[snt2[h.e2].id()];
00771 btrans = (pt2[h.s2].size()
00772 ? pt2[h.s2].at(h.e2-h.s2).get()
00773 : NULL);
00774
00775 vector<vector<int> > aln;
00776 trymatch2(*BT.V1, *BT.V2, buf1.str(),buf2.str(),
00777 atrans,aln);
00778 for (size_t i = 0; i < aln.size(); ++i)
00779 for (size_t k = 0; k < aln[i].size(); ++k)
00780 L[h.s1+i][h.s2+k] = max(L[h.s1+i][h.s2+k],aln[i][k]);
00781 trymatch2(*BT.V2, *BT.V1, buf2.str(),buf1.str(),
00782 btrans,aln);
00783 for (size_t i = 0; i < aln[0].size(); ++i)
00784 for (size_t k = 0; k < aln.size(); ++k)
00785 L[h.s1+i][h.s2+k] = max(L[h.s1+i][h.s2+k],aln[k][i]);
00786
00787
00788 }
00789 }
00790 }
00791 }
00792
00793 vector<alink> links;
00794
00795 alink x;
00796 for (x.r = 0; x.r < L.size(); ++x.r)
00797 {
00798
00799 for (x.c = 0; x.c < L[x.r].size(); ++x.c)
00800 {
00801 x.m = L[x.r][x.c];
00802 if (x.m) links.push_back(x);
00803 }
00804 }
00805
00806 sort(links.begin(),links.end(),greater<alink>());
00807
00808 BOOST_FOREACH(alink& x, links)
00809 {
00810 if (L[x.r][x.c])
00811 {
00812 cout << (*BT.V1)[snt1[x.r].id()] << " ::: "
00813 << (*BT.V2)[snt2[x.c].id()] << " ::: "
00814 << L[x.r][x.c] << endl;
00815 }
00816 }
00817
00818
00819
00820
00821
00822
00823
00824
00825
00826
00827
00828
00829
00830 }
00831 }
00832
00833
00834
00835
00836
00837
00838
00839
00840
00841
00842
00843
00844
00845
00846
00847
00848
00849
00850
00851
00852
00853
00854
00855
00856
00857
00858
00859
00860
00861
00862
00863
00864
00865
00866
00867
00868
00869
00870
00871
00872
00873
00874
00875
00876
00877
00878
00879
00880
00881
00882
00883
00884
00885
00886