00001
00002
00003 #if 0
00004 #include <stdint.h>
00005 #include <string>
00006 #include <vector>
00007 #include <cassert>
00008 #include <iomanip>
00009 #include <algorithm>
00010
00011 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
00012 #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
00013 #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
00014
00015 #include <boost/math/distributions/binomial.hpp>
00016 #include <boost/unordered_map.hpp>
00017 #include <boost/foreach.hpp>
00018
00019 #include "ug_mm_ttrack.h"
00020 #include "ug_mm_tsa.h"
00021 #include "tpt_tokenindex.h"
00022 #include "ug_corpus_token.h"
00023 #include "ug_typedefs.h"
00024 #include "tpt_pickler.h"
00025 #include "ug_bitext.h"
00026 #include "ug_lexical_phrase_scorer2.h"
00027 #include "../sapt_phrase_scorers.h"
00028 using namespace std;
00029 using namespace ugdiss;
00030 using namespace Moses;
00031 using namespace Moses::bitext;
00032
// Threshold (in occurrences) above which caching extraction results
// would pay off.  NOTE(review): not referenced anywhere in this file.
#define CACHING_THRESHOLD 1000

// Shorthand for the lower bound of a binomial confidence interval
// (Clopper-Pearson), used for smoothed probability estimates.
// NOTE(review): not referenced in this file either; presumably used
// by the scorers — confirm before removing.
#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p

// Global counters; unused below — presumably cache hit/miss or
// match/extraction statistics elsewhere (TODO confirm).
size_t mctr=0,xctr=0;

typedef L2R_Token<SimpleWordId> Token;  // left-to-right token over plain word ids
typedef mmBitext<Token> mmbitext;       // memory-mapped bitext over that token type
mmbitext bt;                            // the single global bitext this driver queries

// Smoothing constant for lower-bound probability estimates
// (unused in this file).
float lbsmooth = .005;

// Global phrase-pair scorers, applied to every candidate in
// nbest_phrasepairs().  Semantics inferred from names (defined in
// sapt_phrase_scorers.h): forward / backward phrase probability,
// lexical score, word penalty.
PScorePfwd<Token> calc_pfwd;
PScorePbwd<Token> calc_pbwd;
PScoreLex<Token> calc_lex(1.0);
PScoreWC<Token> apply_wp;

// Feature weights consumed by PhrasePair::eval(); filled in main().
vector<float> fweights;
00050
// Compute the n-best target phrase pairs for the source phrase with
// packed id 'pid1', given its sampled extraction statistics 'ps'.
// 'nbest' arrives sized by the caller to the desired n and leaves
// sorted best-first (it is shrunk if fewer than n candidates qualify).
//
// Strategy: seed 'nbest' with the first n qualifying candidates, then
// stream the remaining candidates through a min-heap maintained over
// the index array 'idx', so only the current worst entry is ever
// compared against and evicted.
void
nbest_phrasepairs(uint64_t const pid1,
                  pstats const& ps,
                  vector<PhrasePair> & nbest)
{
  pstats::trg_map_t::const_iterator m;
  vector<size_t> idx(nbest.size());  // indices into nbest, heap-ordered later
  size_t i=0;
  // Phase 1: fill nbest with the first qualifying candidates.
  for (m = ps.trg.begin();
       m != ps.trg.end() && i < nbest.size();
       ++m)
    {
      // Skip noise: pairs seen fewer than 3 times that also account
      // for less than 1% of the good samples.
      if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
        continue;
      nbest[i].init(pid1,ps,5);  // 5 feature slots (matches fweights in main)
      nbest[i].update(m->first,m->second);
      calc_pfwd(bt, nbest[i]);
      calc_pbwd(bt, nbest[i]);
      calc_lex(bt, nbest[i]);
      apply_wp(bt, nbest[i]);
      nbest[i].eval(fweights);   // fold feature values into the pair's score
      idx[i] = i;
      ++i;
    }
  // Fewer qualifying candidates than requested: shrink both arrays.
  if (i < nbest.size())
    {
      nbest.resize(i);
      idx.resize(i);
    }
  // With greater<PhrasePair>, heap operations over 'idx' keep a
  // MIN-heap: idx[0] always names the worst entry currently in nbest.
  VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>());
  // Phase 2: more candidates remain than slots — stream them, evicting
  // the current worst whenever a better candidate appears.
  if (m != ps.trg.end())
    {
      make_heap(idx.begin(),idx.end(),sorter);
      PhrasePair cand;
      cand.init(pid1,ps,5);
      for (; m != ps.trg.end(); ++m)
        {
          // same rare-pair filter as in phase 1
          if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
            continue;
          cand.update(m->first,m->second);
          calc_pfwd(bt, cand);
          calc_pbwd(bt, cand);
          calc_lex(bt, cand);
          apply_wp(bt, cand);
          cand.eval(fweights);
          if (cand < nbest[idx[0]]) continue;  // not better than current worst
          // replace the worst entry and restore the heap invariant
          pop_heap(idx.begin(),idx.end(),sorter);
          nbest[idx.back()] = cand;
          push_heap(idx.begin(),idx.end(),sorter);
        }
    }
  // Final order for the caller: best first.
  sort(nbest.begin(),nbest.end(),greater<PhrasePair>());
}
00107
// Interactive test driver: reads source-language sentences from stdin
// and, for every source phrase found in the index, prints its sampled
// extraction statistics followed by the scored n-best target phrases.
//
// NOTE(review): the entire body is compiled out by the '#if 0' right
// below (and the whole file is additionally wrapped in '#if 0'); as
// written, main() does nothing but exit(0).
int main(int argc, char* argv[])
{
#if 0
#if 0
  // Generic invocation: corpus base path, language tags, and an
  // optional sample cap from the command line (0 = unlimited).
  string base = argv[1];
  string L1 = argv[2];
  string L2 = argv[3];
  size_t max_samples = argc > 4 ? atoi(argv[4]) : 0;
#else
  // Hard-coded developer setup (German-English corpus); the single
  // optional argument caps the number of samples per phrase.
  string base = "/fs/syn5/germann/exp/sapt/crp/trn/mm/";
  string L1 = "de";
  string L2 = "en";
  size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
#endif
  // Ensure 'base' ends in a separator so "<base><L1>-<L2>..." forms
  // valid file names.
  char c = *base.rbegin();
  if (c != '/' && c != '.')
    base += ".";

  // Uniform feature weights, except full weight on the first feature.
  fweights.resize(5,.25);
  fweights[0] = 1;
  bt.open(base,L1,L2);
  bt.setDefaultSampleSize(max_samples);

  // Register the scorers; each init() apparently returns the index of
  // the next free feature slot, chaining them together (TODO confirm
  // against sapt_phrase_scorers.h).
  size_t i;
  i = calc_pfwd.init(0,.05,'g');
  i = calc_pbwd.init(i,.05,'g');
  i = calc_lex.init(i,base+L1+"-"+L2+".lex");
  i = apply_wp.init(i);

  string line;
  while (getline(cin,line))
    {
      vector<id_type> snt;
      bt.V1->fillIdSeq(line,snt);  // map surface words to vocabulary ids

      // First pass: announce every phrase of the sentence via prep()
      // before querying, presumably so lookups can be prepared ahead
      // of time (e.g. asynchronously) — confirm against ug_bitext.h.
      for (size_t i = 0; i < snt.size(); ++i)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
            bt.prep(m);
        }

      // Second pass: look up each phrase [i..k] and report statistics.
      for (size_t i = 0; i < snt.size(); ++i)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
            {
              uint64_t spid = m.getPid();    // packed id of the source phrase
              SPTR<pstats> s = bt.lookup(m);
              // Echo the source phrase, then good/sampled/raw counts.
              for (size_t j = i; j <= k; ++j)
                cout << (*bt.V1)[snt[j]] << " ";
              cout << s->good << "/"
                   << s->sample_cnt << "/"
                   << s->raw_cnt << endl;

              // Sized to ALL candidates, so nbest_phrasepairs scores
              // and sorts every target phrase (its heap path is idle).
              vector<PhrasePair> nbest(s->trg.size());
              nbest_phrasepairs(spid, *s, nbest);
              BOOST_FOREACH(PhrasePair const& pp, nbest)
                {
                  // Decode the packed target phrase id into
                  // sentence id / offset / length.
                  uint32_t sid,off,len;
                  parse_pid(pp.p2,sid,off,len);
                  uint32_t stop = off + len;

                  // Print score, target phrase, joint/marginal counts,
                  // and the raw feature values.
                  Token const* o = bt.T2->sntStart(sid);
                  cout << " " << setw(6) << pp.score << " ";
                  for (uint32_t i = off; i < stop; ++i)
                    cout << (*bt.V2)[o[i].id()] << " ";
                  cout << pp.joint << "/"
                       << pp.raw1 << "/"
                       << pp.raw2 << " |";
                  BOOST_FOREACH(float f, pp.fvals)
                    cout << " " << f;
                  cout << endl;
                }
            }
        }
    }
#endif
  exit(0);
}
00188 #endif