00001
00002
00003
00004
00005
00006
00007 #ifndef __ug_lexical_phrase_scorer_h
00008 #define __ug_lexical_phrase_scorer_h
00009
00010 #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
00011 #include "tpt_tokenindex.h"
00012 #include <string>
00013 #include <boost/unordered_map.hpp>
00014 #include "tpt_pickler.h"
00015 #include "ug_mm_2d_table.h"
00016 #include "util/exception.hh"
00017
00018 namespace sapt
00019 {
00020
00021 template<typename TKN>
00022 class
00023 LexicalPhraseScorer2
00024 {
00025 std::vector<std::string> ftag;
00026 public:
00027 typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
00028 table_t COOC;
00029 void open(std::string const& fname);
00030 template<typename some_int>
00031 void
00032 score(TKN const* snt1, size_t const s1, size_t const e1,
00033 TKN const* snt2, size_t const s2, size_t const e2,
00034 std::vector<some_int> const & aln, float const alpha,
00035 float & fwd_score, float& bwd_score) const;
00036
00037 void
00038 score(TKN const* snt1, size_t const s1, size_t const e1,
00039 TKN const* snt2, size_t const s2, size_t const e2,
00040 char const* const aln_start, char const* const aln_end,
00041 float const alpha, float & fwd_score, float& bwd_score) const;
00042
00043
00044 float plup_fwd(id_type const s,id_type const t, float const alpha) const;
00045 float plup_bwd(id_type const s,id_type const t, float const alpha) const;
00046
00047
00048
00049
00050
00051 };
00052
00053 template<typename TKN>
00054 void
00055 LexicalPhraseScorer2<TKN>::
00056 open(std::string const& fname)
00057 {
00058 COOC.open(fname);
00059 }
00060
00061 template<typename TKN>
00062 template<typename some_int>
00063 void
00064 LexicalPhraseScorer2<TKN>::
00065 score(TKN const* snt1, size_t const s1, size_t const e1,
00066 TKN const* snt2, size_t const s2, size_t const e2,
00067 std::vector<some_int> const & aln, float const alpha,
00068 float & fwd_score, float& bwd_score) const
00069 {
00070 std::vector<float> p1(e1,0), p2(e2,0);
00071 std::vector<int> c1(e1,0), c2(e2,0);
00072 size_t i1=0,i2=0;
00073 for (size_t k = 0; k < aln.size(); ++k)
00074 {
00075 i1 = aln[k]; i2 = aln[++k];
00076 if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
00077 p1[i1] += plup_fwd(snt1[i1].id(),snt2[i2].id(),alpha);
00078 ++c1[i1];
00079 p2[i2] += plup_bwd(snt1[i1].id(),snt2[i2].id(),alpha);
00080 ++c2[i2];
00081 }
00082 fwd_score = 0;
00083 for (size_t i = s1; i < e1; ++i)
00084 {
00085 if (c1[i] == 1) fwd_score += log(p1[i]);
00086 else if (c1[i]) fwd_score += log(p1[i])-log(c1[i]);
00087 else fwd_score += log(plup_fwd(snt1[i].id(),0,alpha));
00088 }
00089 bwd_score = 0;
00090 for (size_t i = s2; i < e2; ++i)
00091 {
00092 if (c2[i] == 1) bwd_score += log(p2[i]);
00093 else if (c2[i]) bwd_score += log(p2[i])-log(c2[i]);
00094 else bwd_score += log(plup_bwd(0,snt2[i].id(),alpha));
00095 }
00096 }
00097
00098 template<typename TKN>
00099 float
00100 LexicalPhraseScorer2<TKN>::
00101 plup_fwd(id_type const s, id_type const t, float const alpha) const
00102 {
00103 if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
00104 UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
00105 << ": alpha parameter must be >= 0");
00106 float ret = COOC[s][t]+alpha;
00107 ret = (ret?ret:1.)/(COOC.m1(s)+alpha);
00108 UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
00109 << ": result not > 0 and <= 1. alpha = " << alpha << "; "
00110 << COOC[s][t] << "/" << COOC.m1(s));
00111
00112 #if 0
00113 cerr << "[" << s << "," << t << "] "
00114 << COOC.m1(s) << "/"
00115 << COOC[s][t] << "/"
00116 << COOC.m2(t) << std::endl;
00117 #endif
00118 return ret;
00119 }
00120
00121 template<typename TKN>
00122 float
00123 LexicalPhraseScorer2<TKN>::
00124 plup_bwd(id_type const s, id_type const t,float const alpha) const
00125 {
00126 if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
00127 UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
00128 << ": alpha parameter must be >= 0");
00129 float ret = float(COOC[s][t]+alpha);
00130 ret = (ret?ret:1.)/(COOC.m2(t)+alpha);
00131 UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
00132 << ": result not > 0 and <= 1.");
00133 return ret;
00134 }
00135
00136 template<typename TKN>
00137 void
00138 LexicalPhraseScorer2<TKN>::
00139 score(TKN const* snt1, size_t const s1, size_t const e1,
00140 TKN const* snt2, size_t const s2, size_t const e2,
00141 char const* const aln_start, char const* const aln_end,
00142 float const alpha, float & fwd_score, float& bwd_score) const
00143 {
00144 std::vector<float> p1(e1,0), p2(e2,0);
00145 std::vector<int> c1(e1,0), c2(e2,0);
00146 size_t i1=0,i2=0;
00147 for (char const* x = aln_start; x < aln_end;)
00148 {
00149 x = tpt::binread(tpt::binread(x,i1),i2);
00150 if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
00151 p1[i1] += plup_fwd(snt1[i1].id(), snt2[i2].id(),alpha);
00152 ++c1[i1];
00153 p2[i2] += plup_bwd(snt1[i1].id(), snt2[i2].id(),alpha);
00154 ++c2[i2];
00155 }
00156 fwd_score = 0;
00157 for (size_t i = s1; i < e1; ++i)
00158 {
00159 if (c1[i] == 1) fwd_score += log(p1[i]);
00160 else if (c1[i]) fwd_score += log(p1[i])-log(c1[i]);
00161 else fwd_score += log(plup_fwd(snt1[i].id(),0,alpha));
00162 }
00163 bwd_score = 0;
00164 for (size_t i = s2; i < e2; ++i)
00165 {
00166 if (c2[i] == 1) bwd_score += log(p2[i]);
00167 else if (c2[i]) bwd_score += log(p2[i])-log(c2[i]);
00168 else bwd_score += log(plup_bwd(0,snt2[i].id(),alpha));
00169 }
00170 }
00171 }
00172 #endif