00001
00002
00003
00004
00005 #ifndef __ug_lexical_phrase_scorer_h
00006 #define __ug_lexical_phrase_scorer_h
00007
00008 #include "ug_stream.h"
00009 #include "tpt_tokenindex.h"
00010 #include <string>
00011 #include <boost/unordered_map.hpp>
00012 #include "tpt_pickler.h"
00013
00014 namespace ugdiss
00015 {
00016
00017 template<typename TKN>
00018 class
00019 LexicalPhraseScorer1
00020 {
00021 typedef boost::unordered_map<id_type, float> inner_map_t;
00022 std::vector<inner_map_t> L1_given_L2;
00023 std::vector<inner_map_t> L2_given_L1;
00024 void load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2,
00025 std::vector<inner_map_t> & lex);
00026 public:
00027 void open(string const& bname, string const& L1, string const& L2,
00028 TokenIndex & V1, TokenIndex & V2);
00029 void score(TKN const* snt1, size_t const s1, size_t const e1,
00030 TKN const* snt2, size_t const s2, size_t const e2,
00031 std::vector<ushort> aln, float & fwd_score, float& bwd_score);
00032 void score(TKN const* snt1, size_t const s1, size_t const e1,
00033 TKN const* snt2, size_t const s2, size_t const e2,
00034 char const* const aln_start, char const* const aln_end,
00035 float & fwd_score, float& bwd_score);
00036 float permissive_lookup(vector<inner_map_t> const& lex,
00037 id_type const s, id_type const t) const;
00038 };
00039
00040 template<typename TKN>
00041 void
00042 LexicalPhraseScorer1<TKN>::
00043 load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2,
00044 std::vector<inner_map_t> & lex)
00045 {
00046 boost::iostreams::filtering_istream in;
00047 cout << fname << std::endl;
00048 open_input_stream(fname,in);
00049 lex.resize(V1.ksize());
00050 string w1,w2; float p;
00051 while (in >> w1 >> w2 >> p)
00052 {
00053 id_type id1 = V1[w1];
00054 while (lex.size() <= id1)
00055 lex.push_back(inner_map_t());
00056 lex[id1][V2[w2]] = p;
00057 }
00058 }
00059
00060 template<typename TKN>
00061 void
00062 LexicalPhraseScorer1<TKN>::
00063 open(string const& bname, string const& L1, string const& L2,
00064 TokenIndex & V1, TokenIndex & V2)
00065 {
00066 string lex1 = bname+L1+"-"+L2+"."+L1+"-given-"+L2+".lex.gz";
00067 string lex2 = bname+L1+"-"+L2+"."+L2+"-given-"+L1+".lex.gz";
00068 cout << lex1 << std::endl;
00069 cout << lex2 << std::endl;
00070 load_lex(lex1,V1,V2,L1_given_L2);
00071 load_lex(lex2,V2,V1,L2_given_L1);
00072 }
00073
00074 template<typename TKN>
00075 void
00076 LexicalPhraseScorer1<TKN>::
00077 score(TKN const* snt1, size_t const s1, size_t const e1,
00078 TKN const* snt2, size_t const s2, size_t const e2,
00079 vector<ushort> aln, float & fwd_score, float& bwd_score)
00080 {
00081 std::vector<float> p1(e1,0), p2(e2,0);
00082 std::vector<int> c1(e1,0), c2(e2,0);
00083 size_t i1=0,i2=0;
00084 for (size_t k = 0; k < aln.size(); ++k)
00085 {
00086 i1 = aln[k]; i2 = aln[++k];
00087 if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
00088 p1[i1] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id());
00089 ++c1[i1];
00090 p2[i2] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id());
00091 ++c2[i2];
00092 }
00093 fwd_score = 0;
00094 for (size_t i = s1; i < e1; ++i)
00095 {
00096 if (c1[i] == 1) fwd_score += log(p1[i]);
00097 else if (c1[i]) fwd_score += log(p1[i])-log(c1[i]);
00098 else fwd_score += log(L1_given_L2[snt1[i].id()][0]);
00099 }
00100 bwd_score = 0;
00101 for (size_t i = s2; i < e2; ++i)
00102 {
00103 if (c2[i] == 1) bwd_score += log(p2[i]);
00104 else if (c2[i]) bwd_score += log(p2[i])-log(c2[i]);
00105 else bwd_score += log(L2_given_L1[snt2[i].id()][0]);
00106 }
00107 }
00108
00109 template<typename TKN>
00110 float
00111 LexicalPhraseScorer1<TKN>::
00112 permissive_lookup(vector<inner_map_t> const& lex,
00113 id_type const s, id_type const t) const
00114 {
00115 if (s >= lex.size()) return 1.0;
00116 inner_map_t::const_iterator m = lex[s].find(t);
00117 return m == lex[s].end() ? 1.0 : m->second;
00118 }
00119
00120 template<typename TKN>
00121 void
00122 LexicalPhraseScorer1<TKN>::
00123 score(TKN const* snt1, size_t const s1, size_t const e1,
00124 TKN const* snt2, size_t const s2, size_t const e2,
00125 char const* const aln_start, char const* const aln_end,
00126 float & fwd_score, float& bwd_score)
00127 {
00128 std::vector<float> p1(e1,0), p2(e2,0);
00129 std::vector<int> c1(e1,0), c2(e2,0);
00130 size_t i1=0,i2=0;
00131 for (char const* x = aln_start; x < aln_end;)
00132 {
00133 x = binread(binread(x,i1),i2);
00134
00135
00136 if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue;
00137 p1[i1] += permissive_lookup(L1_given_L2, snt1[i1].id(), snt2[i2].id());
00138 ++c1[i1];
00139 p2[i2] += permissive_lookup(L2_given_L1, snt2[i2].id(), snt1[i1].id());
00140 ++c2[i2];
00141 }
00142 fwd_score = 0;
00143 for (size_t i = s1; i < e1; ++i)
00144 {
00145 if (c1[i] == 1) fwd_score += log(p1[i]);
00146 else if (c1[i]) fwd_score += log(p1[i])-log(c1[i]);
00147 else fwd_score += log(L1_given_L2[snt1[i].id()][0]);
00148 }
00149 bwd_score = 0;
00150 for (size_t i = s2; i < e2; ++i)
00151 {
00152 if (c2[i] == 1) bwd_score += log(p2[i]);
00153 else if (c2[i]) bwd_score += log(p2[i])-log(c2[i]);
00154 else bwd_score += log(L2_given_L1[snt2[i].id()][0]);
00155 }
00156 }
00157 }
00158 #endif