00001
00002 #pragma once
00003
00004 #include <algorithm>
00005
00006 #include <boost/random.hpp>
00007 #include <boost/thread.hpp>
00008 #include <boost/thread/locks.hpp>
00009 #include <boost/intrusive_ptr.hpp>
00010 #include <boost/math/distributions/binomial.hpp>
00011
00012 #include "ug_bitext.h"
00013 #include "ug_bitext_pstats.h"
00014 #include "ug_sampling_bias.h"
00015 #include "ug_tsa_array_entry.h"
00016 #include "ug_bitext_phrase_extraction_record.h"
00017 #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
00018 #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
00019 #include "moses/TranslationModel/UG/generic/sorting/NBestList.h"
00020 namespace sapt
00021 {
00022
00023 enum
00024 sampling_method
00025 {
00026 full_coverage,
00027 random_sampling,
00028 ranked_sampling,
00029 ranked_sampling2
00030 };
00031
00032 typedef ttrack::Position TokenPosition;
00033 class CandidateSorter
00034 {
00035 SamplingBias const& score;
00036 public:
00037 CandidateSorter(SamplingBias const& s) : score(s) {}
00038 bool operator()(TokenPosition const& a, TokenPosition const& b) const
00039 { return score[a.sid] > score[b.sid]; }
00040 };
00041
00042 template<typename Token>
00043 class
00044 BitextSampler : public Moses::reference_counter
00045 {
00046 typedef Bitext<Token> bitext;
00047 typedef TSA<Token> tsa;
00048 typedef SamplingBias bias_t;
00049 typedef typename Bitext<Token>::iter tsa_iter;
00050 mutable boost::condition_variable m_ready;
00051 mutable boost::mutex m_lock;
00052
00053
00054
00055 SPTR<bitext const> const m_bitext;
00056 size_t const m_plen;
00057 bool const m_fwd;
00058 SPTR<tsa const> const m_root;
00059 char const* m_next;
00060 char const* m_stop;
00061 sampling_method const m_method;
00062 SPTR<bias_t const> const m_bias;
00063 size_t const m_samples;
00064 size_t const m_min_samples;
00065
00066 SPTR<pstats> m_stats;
00067 size_t m_ctr;
00068 float m_total_bias;
00069 bool m_finished;
00070 size_t m_num_occurrences;
00071 boost::taus88 m_rnd;
00072 double m_bias_total;
00073 bool m_track_sids;
00074
00075 size_t consider_sample(TokenPosition const& p);
00076 size_t perform_random_sampling();
00077 size_t perform_full_phrase_extraction();
00078
00079 int check_sample_distribution(uint64_t const& sid, uint64_t const& offset);
00080 bool flip_coin(id_type const& sid, ushort const& offset, SamplingBias const* bias);
00081
00082 public:
00083 BitextSampler(BitextSampler const& other);
00084
00085 BitextSampler(SPTR<bitext const> const& bitext,
00086 typename bitext::iter const& phrase,
00087 SPTR<SamplingBias const> const& bias,
00088 size_t const min_samples,
00089 size_t const max_samples,
00090 sampling_method const method,
00091 bool const track_sids);
00092 ~BitextSampler();
00093 SPTR<pstats> stats();
00094 bool done() const;
00095 #ifdef MMT
00096 #include "mmt_bitext_sampler-inc.h"
00097 #else
00098 bool operator()();
00099 #endif
00100 };
00101
00102 template<typename Token>
00103 int
00104 BitextSampler<Token>::
00105 check_sample_distribution(uint64_t const& sid, uint64_t const& offset)
00106 {
00107
00108
00109
00110
00111 typedef boost::math::binomial_distribution<> binomial;
00112
00113
00114 std::ostream* log = NULL;
00115
00116 if (!m_bias) return 1;
00117
00118 float p = (*m_bias)[sid];
00119 id_type docid = m_bias->GetClass(sid);
00120
00121 pstats::indoc_map_t::const_iterator m = m_stats->indoc.find(docid);
00122 uint32_t k = m != m_stats->indoc.end() ? m->second : 0 ;
00123
00124
00125
00126 bool ret = (p > .5 || k == 0);
00127
00128 if (ret && !log) return 1;
00129
00130 uint32_t N = m_stats->good;
00131 float d = cdf(complement(binomial(N, p), k));
00132
00133 ret = ret || d >= .05;
00134
00135 #if 0
00136 if (log)
00137 {
00138 Token const* t = m_root->getCorpus()->sntStart(sid)+offset;
00139 Token const* x = t - min(offset,uint64_t(3));
00140 Token const* e = t + 4;
00141 if (e > m_root->getCorpus()->sntEnd(sid))
00142 e = m_root->getCorpus()->sntEnd(sid);
00143 *log << docid << ":" << sid << " " << size_t(k) << "/" << N
00144 << " @" << p << " => " << d << " [";
00145 pstats::indoc_map_t::const_iterator m;
00146 for (m = m_stats->indoc.begin(); m != m_stats->indoc.end(); ++m)
00147 {
00148 if (m != m_stats->indoc.begin()) *log << " ";
00149 *log << m->first << ":" << m->second;
00150 }
00151 *log << "] ";
00152 for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " ";
00153 if (!ret) *log << "SKIP";
00154 else if (p < .5 && d > .9) *log << "FORCE";
00155 *log << std::endl;
00156 }
00157 #endif
00158 return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0);
00159 }
00160
00161 template<typename Token>
00162 bool
00163 BitextSampler<Token>::
00164 flip_coin(id_type const& sid, ushort const& offset, bias_t const* bias)
00165 {
00166 int no_maybe_yes = bias ? check_sample_distribution(sid, offset) : 1;
00167 if (no_maybe_yes == 0) return false;
00168 if (no_maybe_yes > 1) return true;
00169
00170 size_t options_chosen = m_stats->good;
00171 size_t options_total = std::max(m_stats->raw_cnt, m_ctr);
00172 size_t options_left = (options_total - m_ctr);
00173 size_t random_number = options_left * (m_rnd()/(m_rnd.max()+1.));
00174 size_t threshold;
00175 if (bias && m_bias_total > 0)
00176 threshold = ((*bias)[sid]/m_bias_total * options_total * m_samples);
00177 else
00178 threshold = m_samples;
00179 return random_number + options_chosen < threshold;
00180 }
00181
00182
00183
00184
00185 template<typename Token>
00186 BitextSampler<Token>::
00187 BitextSampler(SPTR<Bitext<Token> const> const& bitext,
00188 typename bitext::iter const& phrase,
00189 SPTR<SamplingBias const> const& bias, size_t const min_samples, size_t const max_samples,
00190 sampling_method const method, bool const track_sids)
00191 : m_bitext(bitext)
00192 , m_plen(phrase.size())
00193 , m_fwd(phrase.root == bitext->I1.get())
00194 , m_root(m_fwd ? bitext->I1 : bitext->I2)
00195 , m_next(phrase.lower_bound(-1))
00196 , m_stop(phrase.upper_bound(-1))
00197 , m_method(method)
00198 , m_bias(bias)
00199 , m_samples(max_samples)
00200 , m_min_samples(min_samples)
00201 , m_ctr(0)
00202 , m_total_bias(0)
00203 , m_finished(false)
00204 , m_num_occurrences(phrase.ca())
00205 , m_rnd(0)
00206 , m_track_sids(track_sids)
00207 {
00208 m_stats.reset(new pstats(m_track_sids));
00209 m_stats->raw_cnt = phrase.ca();
00210 m_stats->register_worker();
00211 }
00212
00213 template<typename Token>
00214 BitextSampler<Token>::
00215 BitextSampler(BitextSampler const& other)
00216 : m_bitext(other.m_bitext)
00217 , m_plen(other.m_plen)
00218 , m_fwd(other.m_fwd)
00219 , m_root(other.m_root)
00220 , m_next(other.m_next)
00221 , m_stop(other.m_stop)
00222 , m_method(other.m_method)
00223 , m_bias(other.m_bias)
00224 , m_samples(other.m_samples)
00225 , m_min_samples(other.m_min_samples)
00226 , m_num_occurrences(other.m_num_occurrences)
00227 , m_rnd(0)
00228 {
00229
00230 boost::unique_lock<boost::mutex> mylock(m_lock);
00231 boost::unique_lock<boost::mutex> yrlock(other.m_lock);
00232
00233 m_stats = other.m_stats;
00234 m_stats->register_worker();
00235 m_ctr = other.m_ctr;
00236 m_total_bias = other.m_total_bias;
00237 m_finished = other.m_finished;
00238 }
00239
00240
00241 template<typename Token>
00242 size_t
00243 BitextSampler<Token>::
00244 perform_full_phrase_extraction()
00245 {
00246 if (m_next == m_stop) return m_ctr;
00247 for (sapt::tsa::ArrayEntry I(m_next); I.next < m_stop; ++m_ctr)
00248 {
00249 ++m_ctr;
00250 m_root->readEntry(I.next, I);
00251 consider_sample(I);
00252 }
00253 return m_ctr;
00254 }
00255
00256
00257
00258 template<typename Token>
00259 size_t
00260 BitextSampler<Token>::
00261 perform_random_sampling()
00262 {
00263 if (m_next == m_stop) return m_ctr;
00264 m_bias_total = 0;
00265 sapt::tsa::ArrayEntry I(m_next);
00266 if (m_bias)
00267 {
00268 m_stats->raw_cnt = 0;
00269 while (I.next < m_stop)
00270 {
00271 m_root->readEntry(I.next,I);
00272 ++m_stats->raw_cnt;
00273 m_bias_total += (*m_bias)[I.sid];
00274 }
00275 I.next = m_next;
00276 }
00277
00278 while (m_stats->good < m_samples && I.next < m_stop)
00279 {
00280 ++m_ctr;
00281 m_root->readEntry(I.next,I);
00282 if (!flip_coin(I.sid, I.offset, m_bias.get())) continue;
00283 consider_sample(I);
00284 }
00285 return m_ctr;
00286 }
00287
00288 template<typename Token>
00289 size_t
00290 BitextSampler<Token>::
00291 consider_sample(TokenPosition const& p)
00292 {
00293 std::vector<unsigned char> aln;
00294 bitvector full_aln(100*100);
00295 PhraseExtractionRecord
00296 rec(p.sid, p.offset, p.offset + m_plen, !m_fwd, &aln, &full_aln);
00297 int docid = m_bias ? m_bias->GetClass(p.sid) : m_bitext->sid2did(p.sid);
00298 if (!m_bitext->find_trg_phr_bounds(rec))
00299 {
00300 m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd);
00301 return 0;
00302 }
00303
00304
00305 size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1);
00306 m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd);
00307
00308 float sample_weight = 1./num_pairs;
00309 Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid);
00310
00311
00312 for (size_t k = 1; k < aln.size(); k += 2)
00313 aln[k] += rec.s2 - rec.s1;
00314
00315 std::vector<uint64_t> seen; seen.reserve(10);
00316
00317
00318
00319
00320
00321
00322 size_t max_evidence = 0;
00323 for (size_t s = rec.s1; s <= rec.s2; ++s)
00324 {
00325 TSA<Token> const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1;
00326 SPTR<tsa_iter> b = I.find(o + s, rec.e1 - s);
00327 UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found");
00328
00329 for (size_t i = rec.e1; i <= rec.e2; ++i)
00330 {
00331 uint64_t tpid = b->getPid();
00332 if (find(seen.begin(), seen.end(), tpid) != seen.end())
00333 continue;
00334 seen.push_back(tpid);
00335 size_t raw2 = b->approxOccurrenceCount();
00336 size_t evid = m_stats->add(tpid, sample_weight,
00337 m_bias ? (*m_bias)[p.sid] : 1,
00338 aln, raw2, rec.po_fwd, rec.po_bwd, docid,
00339 p.sid);
00340 max_evidence = std::max(max_evidence, evid);
00341 bool ok = (i == rec.e2) || b->extend(o[i].id());
00342 UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
00343 }
00344 if (s < rec.s2)
00345 for (size_t k = 1; k < aln.size(); k += 2)
00346 --aln[k];
00347 }
00348 return max_evidence;
00349 }
00350
00351 #ifndef MMT
00352 template<typename Token>
00353 bool
00354 BitextSampler<Token>::
00355 operator()()
00356 {
00357 if (m_finished) return true;
00358 boost::unique_lock<boost::mutex> lock(m_lock);
00359 if (m_method == full_coverage)
00360 perform_full_phrase_extraction();
00361 else if (m_method == random_sampling)
00362 perform_random_sampling();
00363 else UTIL_THROW2("Unsupported sampling method.");
00364 m_finished = true;
00365 m_ready.notify_all();
00366 return true;
00367 }
00368 #endif
00369
00370
00371 template<typename Token>
00372 bool
00373 BitextSampler<Token>::
00374 done() const
00375 {
00376 return m_next == m_stop;
00377 }
00378
00379 template<typename Token>
00380 SPTR<pstats>
00381 BitextSampler<Token>::
00382 stats()
00383 {
00384
00385
00386
00387
00388 return m_stats;
00389 }
00390
00391 template<typename Token>
00392 BitextSampler<Token>::
00393 ~BitextSampler()
00394 {
00395 m_stats->release();
00396 }
00397
00398 }
00399