00001
00002
00003 #include "ug_bitext.h"
00004 #include <algorithm>
00005 #include <boost/math/distributions/binomial.hpp>
00006
00007 namespace sapt
00008 {
00009
00010 float
00011 lbop(size_t const tries, size_t const succ, float const confidence)
00012 {
00013 return (confidence == 0
00014 ? float(succ)/tries
00015 : (boost::math::binomial_distribution<>::
00016 find_lower_bound_on_p(tries, succ, confidence)));
00017 }
00018
00019 void
00020 snt_adder<L2R_Token<SimpleWordId> >::
00021 operator()()
00022 {
00023 typedef L2R_Token<SimpleWordId> tkn;
00024 std::vector<id_type> sids; sids.reserve(snt.size());
00025 BOOST_FOREACH(std::string const& foo, snt)
00026 {
00027 sids.push_back(track ? track->size() : 0);
00028 std::istringstream buf(foo);
00029 std::string w;
00030 std::vector<tkn> s; s.reserve(100);
00031 while (buf >> w) s.push_back(tkn(V[w]));
00032 track = append(track,s);
00033 }
00034 if (index)
00035 index.reset(new imTSA<tkn>(*index,track,sids,V.tsize()));
00036 else
00037 index.reset(new imTSA<tkn>(track,NULL,NULL));
00038 }
00039
00040 snt_adder<L2R_Token<SimpleWordId> >::
00041 snt_adder(std::vector<std::string> const& s, TokenIndex& v,
00042 SPTR<imTtrack<L2R_Token<SimpleWordId> > >& t,
00043 SPTR<imTSA<L2R_Token<SimpleWordId> > >& i)
00044 : snt(s), V(v), track(t), index(i)
00045 { }
00046
// Grow the smallest alignment-closed phrase pair whose target side
// starts at position /s2/.
//
// @param a1  a1[p] lists the side-2 positions linked to side-1 position p
// @param a2  a2[p] lists the side-1 positions linked to side-2 position p
// @param s2  first side-2 position of the phrase pair to build
// @param L1  start (inclusive) of the side-1 span that must be avoided
// @param R1  end (exclusive) of the side-1 span that must be avoided
// @param s1  out: first side-1 position of the resulting span
// @param e1  out: one past the last side-1 position of the resulting span
// @param e2  out: one past the last side-2 position of the resulting span
// @return true on success. false if
//         - s2 is out of range or has no links,
//         - a link points outside a1 / a2,
//         - the side-1 span would touch or straddle [L1,R1),
//         - a side-1 word inside the span links to a side-2 position
//           before s2.
//         On failure the output parameters may hold intermediate values.
//
// Implementation notes:
// - s1, e1 and e2 are kept as *inclusive* bounds while the agenda is
//   processed and converted to exclusive ends only on success.
// - Every position is put on the agenda exactly once, namely at the
//   moment a bound moves over it (or as a seed), so no "done" flags
//   are needed.
// - The first two failure paths still print __FILE__:__LINE__ to
//   std::cout, as this function has always done.
bool
expand_phrase_pair
(std::vector<std::vector<ushort> >& a1,
 std::vector<std::vector<ushort> >& a2,
 ushort const s2,
 ushort const L1, ushort const R1,
 ushort & s1, ushort & e1, ushort& e2)
{
  typedef std::pair<ushort,ushort> item_t; // (side: 1 or 2, position)

  if (s2 >= a2.size() || a2[s2].empty())
    {
      std::cout << __FILE__ << ":" << __LINE__ << std::endl;
      return false;
    }

  e2 = s2;
  s1 = e1 = a2[s2].front();
  if (s1 >= L1 && s1 < R1)
    {
      std::cout << __FILE__ << ":" << __LINE__ << std::endl;
      return false;
    }
  if (s1 >= a1.size())
    return false; // malformed alignment: link points outside a1

  std::vector<item_t> agenda;
  agenda.reserve(a1.size() + a2.size());
  // Seed with the start word on side 2 *and* its first partner on
  // side 1; without the latter that word's own links are never checked.
  agenda.push_back(item_t(2, s2));
  agenda.push_back(item_t(1, s1));

  while (!agenda.empty())
    {
      item_t const cur = agenda.back();
      agenda.pop_back();

      if (cur.first == 1)
        {
          std::vector<ushort> const& links = a1[cur.second];
          for (size_t k = 0; k < links.size(); ++k)
            {
              ushort const t = links[k];
              if (t < s2)
                return false; // span would have to start before s2
              if (t >= a2.size())
                return false; // malformed alignment
              while (e2 < t)
                {
                  ++e2;
                  agenda.push_back(item_t(2, e2));
                }
            }
        }
      else
        {
          std::vector<ushort> const& links = a2[cur.second];
          for (size_t k = 0; k < links.size(); ++k)
            {
              ushort const s = links[k];
              if (s >= a1.size())
                return false; // malformed alignment
              if ((e1 < L1 && s >= L1) ||  // span left of [L1,R1), link is not
                  (s1 >= R1 && s < R1) ||  // span right of [L1,R1), link is not
                  (s >= L1 && s < R1))     // link falls inside [L1,R1)
                return false;
              while (e1 < s)
                {
                  ++e1;
                  agenda.push_back(item_t(1, e1));
                }
              while (s1 > s)
                {
                  --s1;
                  agenda.push_back(item_t(1, s1));
                }
            }
        }
    }

  // inclusive -> exclusive end positions
  ++e1;
  ++e2;
  return true;
}
00130
// Dump an alignment matrix to std::cout for debugging.
//
// @param a1    a1[r] lists the columns linked to row r
// @param len2  number of columns
// @param b1,e1 row range [b1,e1) of the box to highlight
// @param b2,e2 column range [b2,e2) of the box to highlight
//
// Output: one line "b1-e1 b2-e2", a header with the column numbers
// modulo 10, one line per row, and a closing rule of 90 dashes.
// Cells are drawn as
//   'x' link inside the box     '-' no link inside the box
//   'o' link outside the box    '.' no link outside the box
// Links to columns >= len2 cannot be drawn and are ignored.
void
print_amatrix(std::vector<std::vector<ushort> > a1, uint32_t len2,
              ushort b1, ushort e1, ushort b2, ushort e2)
{
  // linked[r][c] != 0  <=>  row r is linked to column c
  std::vector<std::vector<char> > linked(a1.size(),
                                         std::vector<char>(len2, 0));
  for (size_t r = 0; r < a1.size(); ++r)
    for (size_t k = 0; k < a1[r].size(); ++k)
      if (a1[r][k] < len2)
        linked[r][a1[r][k]] = 1;

  std::cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << std::endl;

  // Four blanks: same width as the "setw(3) << r << ' '" row prefix
  // below, so the column digits sit above their cells.
  std::cout << "    ";
  for (size_t c = 0; c < len2; ++c)
    std::cout << c%10;
  std::cout << std::endl;

  for (size_t r = 0; r < linked.size(); ++r)
    {
      std::cout << std::setw(3) << r << " ";
      bool const row_in_box = (b1 <= r) && (r < e1);
      for (size_t c = 0; c < linked[r].size(); ++c)
        {
          bool const in_box = row_in_box && b2 <= c && c < e2;
          bool const link   = linked[r][c] != 0;
          char const mark   = in_box ? (link ? 'x' : '-')
                                     : (link ? 'o' : '.');
          std::cout << mark;
        }
      std::cout << std::endl;
    }
  std::cout << std::string(90,'-') << std::endl;
}
00160
00161 void
00162 write_bitvector(bitvector const& v, std::ostream& out)
00163 {
00164 for (size_t i = v.find_first(); i < v.size();)
00165 {
00166 out << i;
00167 if ((i = v.find_next(i)) < v.size()) out << ",";
00168 }
00169 }
00170
00171 }