00001
00002 #pragma once
#include <cassert>
#include <cmath>
#include <map>
#include <ostream>
#include <vector>
#include "ug_typedefs.h"
#include "ug_bitext_pstats.h"
#ifndef NO_MOSES
#include "moses/FF/LexicalReordering/LRState.h"
#endif
#include "boost/format.hpp"
#include "tpt_tokenindex.h"
00011
00012 namespace sapt
00013 {
00014
00015 template<typename Token>
00016 class
00017 PhrasePair
00018 {
00019 public:
00020 class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; };
00021 Token const* start1;
00022 Token const* start2;
00023 uint32_t len1;
00024 uint32_t len2;
00025 uint64_t p1, p2;
00026 uint32_t raw1, raw2, sample1, sample2, good1, good2, joint;
00027 float cum_bias;
00028 std::vector<float> fvals;
00029 float dfwd[LRModel::NONE+1];
00030 float dbwd[LRModel::NONE+1];
00031 std::vector<unsigned char> aln;
00032 float score;
00033 bool inverse;
00034 SPTR<std::vector<uint32_t> > sids;
00035
00036
00037 std::map<uint32_t,uint32_t> indoc;
00038 PhrasePair() { };
00039 PhrasePair(PhrasePair const& o);
00040
00041 PhrasePair const& operator+=(PhrasePair const& other);
00042
00043 bool operator<(PhrasePair const& other) const;
00044 bool operator>(PhrasePair const& other) const;
00045 bool operator<=(PhrasePair const& other) const;
00046 bool operator>=(PhrasePair const& other) const;
00047
00048 void init();
00049 void init(uint64_t const pid1, bool is_inverse,
00050 Token const* x, uint32_t const len,
00051 pstats const* ps = NULL, size_t const numfeats=0);
00052
00053 PhrasePair const&
00054 update(uint64_t const pid2, Token const* x,
00055 uint32_t const len, jstats const& js);
00056
00057 void
00058 fill_lr_vec(LRModel::Direction const& dir,
00059 LRModel::ModelType const& mdl,
00060 std::vector<float>& v) const;
00061 #ifndef NO_MOSES
00062 void
00063 print(std::ostream& out, TokenIndex const& V1, TokenIndex const& V2,
00064 LRModel const& LR) const;
00065 #endif
00066
00067 class SortByTargetIdSeq
00068 {
00069 public:
00070 int cmp(PhrasePair const& a, PhrasePair const& b) const;
00071 bool operator()(PhrasePair const& a, PhrasePair const& b) const;
00072 };
00073
00074 class SortDescendingByJointCount
00075 {
00076 public:
00077 int cmp(PhrasePair const& a, PhrasePair const& b) const;
00078 bool operator()(PhrasePair const& a, PhrasePair const& b) const;
00079 };
00080 };
00081
00082 template<typename Token>
00083 void PhrasePair<Token>
00084 ::init(uint64_t const pid1, bool is_inverse,
00085 Token const* x, uint32_t const len,
00086 pstats const* ps, size_t const numfeats)
00087 {
00088 inverse = is_inverse;
00089 start1 = x; len1 = len;
00090 p1 = pid1;
00091 p2 = 0;
00092 if (ps)
00093 {
00094 raw1 = ps->raw_cnt;
00095 sample1 = ps->sample_cnt;
00096 good1 = ps->good;
00097 }
00098 else raw1 = sample1 = good1 = 0;
00099 joint = 0;
00100 good2 = 0;
00101 sample2 = 0;
00102 raw2 = 0;
00103 cum_bias = 0;
00104 fvals.resize(numfeats);
00105 }
00106
00107 template<typename Token>
00108 PhrasePair<Token> const&
00109 PhrasePair<Token>
00110 ::update(uint64_t const pid2,
00111 Token const* x, uint32_t const len, jstats const& js)
00112 {
00113 p2 = pid2;
00114 start2 = x; len2 = len;
00115 raw2 = js.cnt2();
00116 joint = js.rcnt();
00117 cum_bias = js.bcnt();
00118 assert(js.aln().size());
00119 if (js.aln().size())
00120 aln = js.aln()[0].second;
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130 for (int i = 0; i <= LRModel::NONE; i++)
00131 {
00132 PhraseOrientation po = static_cast<PhraseOrientation>(i);
00133 dfwd[i] = js.dcnt_fwd(po);
00134 dbwd[i] = js.dcnt_bwd(po);
00135 }
00136
00137 sids = js.sids;
00138 indoc = js.indoc;
00139 return *this;
00140 }
00141
00142 template<typename Token>
00143 bool
00144 PhrasePair<Token>
00145 ::operator<(PhrasePair const& other) const
00146 {
00147 return this->score < other.score;
00148 }
00149
00150 template<typename Token>
00151 bool
00152 PhrasePair<Token>
00153 ::operator>(PhrasePair const& other) const
00154 {
00155 return this->score > other.score;
00156 }
00157
00158 template<typename Token>
00159 bool
00160 PhrasePair<Token>
00161 ::operator<=(PhrasePair const& other) const
00162 {
00163 return this->score <= other.score;
00164 }
00165
00166 template<typename Token>
00167 bool
00168 PhrasePair<Token>
00169 ::operator>=(PhrasePair const& other) const
00170 {
00171 return this->score >= other.score;
00172 }
00173
00174 template<typename Token>
00175 PhrasePair<Token> const&
00176 PhrasePair<Token>
00177 ::operator+=(PhrasePair const& o)
00178 {
00179 raw1 += o.raw1;
00180 raw2 += o.raw2;
00181 good1 += o.good1;
00182 good2 += o.good2;
00183 joint += o.joint;
00184 sample1 += o.sample1;
00185 sample2 += o.sample2;
00186 cum_bias += o.cum_bias;
00187
00188 if (sids && o.sids)
00189 sids->insert(sids->end(), o.sids->begin(), o.sids->end());
00190 return *this;
00191 }
00192
00193 template<typename Token>
00194 PhrasePair<Token>
00195 ::PhrasePair(PhrasePair<Token> const& o)
00196 : start1(o.start1) , start2(o.start2)
00197 , len1(o.len1) , len2(o.len2)
00198 , p1(o.p1) , p2(o.p2)
00199 , raw1(o.raw1) , raw2(o.raw2)
00200 , sample1(o.sample1) , sample2(o.sample2)
00201 , good1(o.good1) , good2(o.good2)
00202 , joint(o.joint) , cum_bias(o.cum_bias)
00203 , fvals(o.fvals)
00204 , aln(o.aln)
00205 , score(o.score)
00206 , inverse(o.inverse)
00207 , sids(o.sids)
00208 , indoc(o.indoc)
00209 {
00210 for (int i = 0; i <= LRModel::NONE; ++i)
00211 {
00212 dfwd[i] = o.dfwd[i];
00213 dbwd[i] = o.dbwd[i];
00214 }
00215 }
00216
00217 template<typename Token>
00218 int PhrasePair<Token>
00219 ::SortByTargetIdSeq
00220 ::cmp(PhrasePair const& a, PhrasePair const& b) const
00221 {
00222 size_t i = 0;
00223 Token const* x = a.start2;
00224 Token const* y = b.start2;
00225 while (i < a.len2 && i < b.len2 && x->id() == y->id())
00226 {
00227 x = x->next();
00228 y = y->next();
00229 ++i;
00230 }
00231 if (i == a.len2 && i == b.len2) return 0;
00232 if (i == a.len2) return -1;
00233 if (i == b.len2) return 1;
00234 return x->id() < y->id() ? -1 : 1;
00235 }
00236
00237 template<typename Token>
00238 bool PhrasePair<Token>
00239 ::SortByTargetIdSeq
00240 ::operator()(PhrasePair const& a, PhrasePair const& b) const
00241 {
00242 return this->cmp(a,b) < 0;
00243 }
00244
00245 template<typename Token>
00246 int PhrasePair<Token>
00247 ::SortDescendingByJointCount
00248 ::cmp(PhrasePair const& a, PhrasePair const& b) const
00249 {
00250 if (a.joint == b.joint) return 0;
00251 return a.joint > b.joint ? -1 : 1;
00252 }
00253
00254 template<typename Token>
00255 bool
00256 PhrasePair<Token>
00257 ::SortDescendingByJointCount
00258 ::operator()(PhrasePair const& a, PhrasePair const& b) const
00259 {
00260 return this->cmp(a,b) < 0;
00261 }
00262
00263 template<typename Token>
00264 void
00265 PhrasePair<Token>
00266 ::init()
00267 {
00268 inverse = false;
00269 len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
00270 start1 = start2 = NULL;
00271 p1 = p2 = 0;
00272 }
00273
00274
00275 void
00276 fill_lr_vec2(LRModel::ModelType mdl, float const* const cnt,
00277 float const total, float* v);
00278
00279 template<typename Token>
00280 void
00281 PhrasePair<Token>
00282 ::fill_lr_vec(LRModel::Direction const& dir,
00283 LRModel::ModelType const& mdl,
00284 std::vector<float>& v) const
00285 {
00286
00287 size_t num_scores = (mdl == LRModel::MSLR ? 4 : mdl == LRModel::MSD ? 3 : 2);
00288 size_t offset;
00289 if (dir == LRModel::Bidirectional)
00290 {
00291 offset = num_scores;
00292 num_scores *= 2;
00293 }
00294 else offset = 0;
00295
00296 v.resize(num_scores);
00297
00298
00299 float total = 0;
00300 for (size_t i = 0; i <= LRModel::NONE; ++i)
00301 total += dfwd[i];
00302
00303 if (dir != LRModel::Forward)
00304 fill_lr_vec2(mdl, dbwd, total, &v[0]);
00305 if (dir != LRModel::Backward)
00306 fill_lr_vec2(mdl, dfwd, total, &v[offset]);
00307 }
00308
00309
00310 #ifndef NO_MOSES
00311 template<typename Token>
00312 void
00313 PhrasePair<Token>
00314 ::print(std::ostream& out, TokenIndex const& V1, TokenIndex const& V2,
00315 LRModel const& LR) const
00316 {
00317 out << toString (V1, this->start1, this->len1) << " ::: "
00318 << toString (V2, this->start2, this->len2) << " "
00319 << this->joint << " [";
00320
00321 for (std::map<uint32_t,uint32_t>::const_iterator m = indoc.begin();
00322 m != indoc.end(); ++m)
00323 {
00324 if (m != indoc.begin()) out << " ";
00325 out << m->first << ":" << m->second;
00326 }
00327 out << "] [";
00328 std::vector<float> lrscores;
00329 this->fill_lr_vec(LR.GetDirection(), LR.GetModelType(), lrscores);
00330 for (size_t i = 0; i < lrscores.size(); ++i)
00331 {
00332 if (i) out << " ";
00333 out << boost::format("%.2f") % exp(lrscores[i]);
00334 }
00335 out << "]" << std::endl;
00336 #if 0
00337 for (int i = 0; i <= Moses::LRModel::NONE; i++)
00338 {
00339
00340 if (i) *log << " ";
00341 *log << p.dfwd[i];
00342 }
00343 *log << "] [";
00344 for (int i = 0; i <= Moses::LRModel::NONE; i++)
00345 {
00346
00347 if (i) *log << " ";
00348 *log << p.dbwd[i];
00349 }
00350 #endif
00351 }
00352 #endif
00353 }