00001
00002 #pragma once
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #define UG_BITEXT_TRACK_ACTIVE_THREADS 0
00020
00021 #include <string>
00022 #include <vector>
00023 #include <cassert>
00024 #include <iomanip>
00025 #include <algorithm>
00026
00027 #include <boost/foreach.hpp>
00028 #include <boost/random.hpp>
00029 #include <boost/format.hpp>
00030 #include <boost/thread.hpp>
00031 #include <boost/unordered_map.hpp>
00032 #include <boost/math/distributions/binomial.hpp>
00033
00034 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
00035 #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
00036 #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
00037 #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
00038 #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
00039
00040 #include "moses/Util.h"
00041
00042 #ifndef NO_MOSES
00043
00044 #include "moses/StaticData.h"
00045 #include "moses/thread_safe_container.h"
00046 #include "moses/ContextScope.h"
00047 #include "moses/TranslationTask.h"
00048 #else
00049
00050 #endif
00051
00052 #include "util/exception.hh"
00053
00054
00055 #include "ug_typedefs.h"
00056 #include "ug_mm_ttrack.h"
00057 #include "ug_im_ttrack.h"
00058 #include "ug_mm_tsa.h"
00059 #include "ug_im_tsa.h"
00060 #include "tpt_tokenindex.h"
00061 #include "ug_corpus_token.h"
00062 #include "tpt_pickler.h"
00063 #include "ug_lexical_phrase_scorer2.h"
00064 #include "ug_lru_cache.h"
00065 #include "ug_lexical_reordering.h"
00066 #include "ug_sampling_bias.h"
00067 #include "ug_phrasepair.h"
00068 #include "ug_bitext_phrase_extraction_record.h"
00069 #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
00070
00071
00072
00073
00074 #define PSTATS_CACHE_THRESHOLD 50
00075
00076 namespace Moses { class Mmsapt; }
00077 namespace sapt
00078 {
00079 using Moses::ttasksptr;
00080 using Moses::ttaskwptr;
00081 using tpt::binread;
00082 using tpt::binwrite;
00083
00084 float lbop(size_t const tries, size_t const succ, float const confidence);
00085 void write_bitvector(bitvector const& v, std::ostream& out);
00086
00087 #ifndef NO_MOSES
00088 struct
00089 ContextForQuery
00090 {
00091
00092
00093
00094 boost::shared_mutex lock;
00095 SPTR<SamplingBias> bias;
00096 SPTR<pstats::cache_t> cache1, cache2;
00097 std::ostream* bias_log;
00098 ContextForQuery() : bias_log(NULL) { }
00099 };
00100 #endif
00101
00102 template<typename Token> class BitextSampler;
00103
00104 template<typename TKN>
00105 class Bitext
00106 {
00107 public:
00108 template<typename Token> friend class BitextSampler;
00109 typedef TKN Token;
00110 typedef typename TSA<Token>::tree_iterator iter;
00111 typedef typename std::vector<PhrasePair<Token> > vec_ppair;
00112 typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t;
00113 typedef TSA<Token> tsa;
00114 friend class Moses::Mmsapt;
00115 protected:
00116 mutable boost::shared_mutex m_lock;
00117
00118 class agenda;
00119 mutable SPTR<agenda> ag;
00120 size_t m_num_workers;
00121
00122 size_t m_default_sample_size;
00123 size_t m_pstats_cache_threshold;
00124 SPTR<pstats::cache_t> m_cache1, m_cache2;
00125
00126 std::vector<std::string> m_docname;
00127 std::map<std::string,id_type> m_docname2docid;
00128 SPTR<std::vector<id_type> > m_sid2docid;
00129
00130 mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
00131
00132
00133 public:
00134 SPTR<Ttrack<char> > Tx;
00135 SPTR<Ttrack<Token> > T1;
00136 SPTR<Ttrack<Token> > T2;
00137 SPTR<TokenIndex> V1;
00138 SPTR<TokenIndex> V2;
00139 SPTR<TSA<Token> > I1;
00140 SPTR<TSA<Token> > I2;
00141
00143
00144
00145
00146
00147 bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const;
00148 bool find_trg_phr_bounds
00149 ( size_t const sid,
00150 size_t const start,
00151 size_t const stop,
00152 size_t & s1, size_t & s2,
00153 size_t & e1, size_t & e2,
00154 int& po_fwd, int& po_bwd,
00155 std::vector<unsigned char> * core_alignment,
00156 bitvector* full_alignment,
00157 bool const flip) const;
00158
00159
00160
00161 SPTR<pstats>
00162 prep2(iter const& phrase, int max_sample = -1) const;
00163
00164 #ifndef NO_MOSES
00165 SPTR<pstats>
00166 prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
00167 int max_sample = -1) const;
00168 #endif
00169
00170 protected:
00171 Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16);
00172
00173 Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2,
00174 Ttrack<char>* const tx,
00175 TokenIndex* const v1, TokenIndex* const v2,
00176 TSA<Token>* const i1, TSA<Token>* const i2,
00177 size_t const max_sample=1000,
00178 size_t const xnum_workers=16);
00179 public:
00180 virtual void
00181 open(std::string const base, std::string const L1, std::string const L2) = 0;
00182
00183 SPTR<pstats>
00184 lookup(iter const& phrase, int max_sample = -1) const;
00185
00186 void prep(iter const& phrase) const;
00187
00188 #ifndef NO_MOSES
00189 SPTR<pstats>
00190 lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
00191
00192 void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
00193 #endif
00194
00195 void setDefaultSampleSize(size_t const max_samples);
00196 size_t getDefaultSampleSize() const;
00197
00198 std::string toString(uint64_t pid, int isL2) const;
00199
00200 virtual size_t revision() const { return 0; }
00201
00202 SPTR<SentenceBias>
00203 loadSentenceBias(std::string const& fname) const;
00204
00205 SPTR<DocumentBias>
00206 SetupDocumentBias(std::string const& bserver, std::string const& text,
00207 std::ostream* log) const;
00208
00209 SPTR<DocumentBias>
00210 SetupDocumentBias(std::map<std::string,float> context_weights,
00211 std::ostream* log) const;
00212
00213 void
00214 mark_match(Token const* start, Token const* end, iter const& m,
00215 bitvector& check) const;
00216 void
00217 write_yawat_alignment
00218 ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const;
00219
00220 std::string sid2docname(id_type const sid) const;
00221 std::string docid2name(id_type const sid) const;
00222 int docname2docid(std::string const& name) const;
00223
00224 std::vector<id_type> const* sid2did() const;
00225 int sid2did(uint32_t sid) const;
00226 };
00227
00228 #include "ug_bitext_agenda.h"
00229
00230 template<typename Token>
00231 int
00232 Bitext<Token>::
00233 docname2docid(std::string const& name) const
00234 {
00235 std::map<std::string,id_type>::const_iterator m;
00236 m = m_docname2docid.find(name);
00237 if (m != m_docname2docid.end()) return m->second;
00238 return -1;
00239 }
00240
00241 template<typename Token>
00242 std::string
00243 Bitext<Token>::
00244 docid2name(id_type const did) const
00245 {
00246 if (did < m_docname.size())
00247 return m_docname[did];
00248 else
00249 return (boost::format("%d") % did).str();
00250 }
00251
00252 template<typename Token>
00253 std::string
00254 Bitext<Token>::
00255 sid2docname(id_type const sid) const
00256 {
00257 if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size())
00258 return m_docname[(*m_sid2docid)[sid]];
00259 else
00260 return "";
00261 }
00262
00263 template<typename Token>
00264 std::vector<id_type> const*
00265 Bitext<Token>::
00266 sid2did() const
00267 {
00268 return m_sid2docid.get();
00269 }
00270
00271 template<typename Token>
00272 int
00273 Bitext<Token>::
00274 sid2did(uint32_t sid) const
00275 {
00276 if (m_sid2docid)
00277 return m_sid2docid->at(sid);
00278 return -1;
00279 }
00280
00281
00282 template<typename Token>
00283 SPTR<SentenceBias>
00284 Bitext<Token>::
00285 loadSentenceBias(std::string const& fname) const
00286 {
00287 SPTR<SentenceBias> ret(new SentenceBias(T1->size()));
00288 std::ifstream in(fname.c_str());
00289 size_t i = 0;
00290 float v; while (in>>v) (*ret)[i++] = v;
00291 UTIL_THROW_IF2(i != T1->size(),
00292 "Mismatch between bias vector size and corpus size at "
00293 << HERE);
00294 return ret;
00295 }
00296
00297 template<typename Token>
00298 std::string
00299 Bitext<Token>::
00300 toString(uint64_t pid, int isL2) const
00301 {
00302 std::ostringstream buf;
00303 uint32_t sid,off,len; parse_pid(pid,sid,off,len);
00304 Token const* t = (isL2 ? T2 : T1)->sntStart(sid) + off;
00305 Token const* x = t + len;
00306 TokenIndex const& V = isL2 ? *V2 : *V1;
00307 while (t < x)
00308 {
00309 buf << V[t->id()];
00310 if (++t < x) buf << " ";
00311 }
00312 return buf.str();
00313 }
00314
00315 template<typename Token>
00316 size_t
00317 Bitext<Token>::
00318 getDefaultSampleSize() const
00319 {
00320 return m_default_sample_size;
00321 }
00322 template<typename Token>
00323 void
00324 Bitext<Token>::
00325 setDefaultSampleSize(size_t const max_samples)
00326 {
00327 boost::unique_lock<boost::shared_mutex> guard(m_lock);
00328 if (max_samples != m_default_sample_size)
00329 {
00330 m_cache1.reset(new pstats::cache_t);
00331 m_cache2.reset(new pstats::cache_t);
00332 m_default_sample_size = max_samples;
00333 }
00334 }
00335
00336 template<typename Token>
00337 Bitext<Token>::
00338 Bitext(size_t const max_sample, size_t const xnum_workers)
00339 : m_num_workers(xnum_workers)
00340 , m_default_sample_size(max_sample)
00341 , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
00342 , m_cache1(new pstats::cache_t)
00343 , m_cache2(new pstats::cache_t)
00344 { }
00345
00346 template<typename Token>
00347 Bitext<Token>::
00348 Bitext(Ttrack<Token>* const t1,
00349 Ttrack<Token>* const t2,
00350 Ttrack<char>* const tx,
00351 TokenIndex* const v1,
00352 TokenIndex* const v2,
00353 TSA<Token>* const i1,
00354 TSA<Token>* const i2,
00355 size_t const max_sample,
00356 size_t const xnum_workers)
00357 : m_num_workers(xnum_workers)
00358 , m_default_sample_size(max_sample)
00359 , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
00360 , m_cache1(new pstats::cache_t)
00361 , m_cache2(new pstats::cache_t)
00362 , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
00363 { }
00364
00365 template<typename TKN> class snt_adder;
00366 template<> class snt_adder<L2R_Token<SimpleWordId> >;
00367
00368 template<>
00369 class snt_adder<L2R_Token<SimpleWordId> >
00370 {
00371 typedef L2R_Token<SimpleWordId> TKN;
00372 std::vector<std::string> const & snt;
00373 TokenIndex & V;
00374 SPTR<imTtrack<TKN> > & track;
00375 SPTR<imTSA<TKN > > & index;
00376 public:
00377 snt_adder(std::vector<std::string> const& s, TokenIndex& v,
00378 SPTR<imTtrack<TKN> >& t, SPTR<imTSA<TKN> >& i);
00379
00380 void operator()();
00381 };
00382
00383 template<typename Token>
00384 bool
00385 Bitext<Token>::
00386 find_trg_phr_bounds(PhraseExtractionRecord& rec) const
00387 {
00388 return find_trg_phr_bounds(rec.sid, rec.start, rec.stop,
00389 rec.s1, rec.s2, rec.e1, rec.e2,
00390 rec.po_fwd, rec.po_bwd,
00391 rec.aln, rec.full_aln, rec.flip);
00392 }
00393
00394 template<typename Token>
00395 bool
00396 Bitext<Token>::
00397 find_trg_phr_bounds
00398 ( size_t const sid,
00399 size_t const start,
00400 size_t const stop,
00401 size_t & s1, size_t & s2,
00402 size_t & e1, size_t & e2,
00403 int& po_fwd, int& po_bwd,
00404 std::vector<unsigned char> * core_alignment,
00405 bitvector* full_alignment,
00406 bool const flip) const
00407 {
00408
00409
00410
00411
00412
00413
00414
00415
00416 assert(T1);
00417 assert(T2);
00418 assert(Tx);
00419
00420 size_t slen1,slen2;
00421 if (flip)
00422 {
00423 slen1 = T2->sntLen(sid);
00424 slen2 = T1->sntLen(sid);
00425 }
00426 else
00427 {
00428 slen1 = T1->sntLen(sid);
00429 slen2 = T2->sntLen(sid);
00430 }
00431 bitvector forbidden(slen2);
00432 if (full_alignment)
00433 {
00434 if (slen1*slen2 > full_alignment->size())
00435 full_alignment->resize(slen1*slen2*2);
00436 full_alignment->reset();
00437 }
00438 size_t src,trg;
00439 size_t lft = forbidden.size();
00440 size_t rgt = 0;
00441 std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
00442
00443
00444 char const* p = Tx->sntStart(sid);
00445 char const* x = Tx->sntEnd(sid);
00446 while (p < x)
00447 {
00448 if (flip)
00449 {
00450 p = binread(p,trg);
00451 assert(p<x);
00452 p = binread(p,src);
00453 }
00454 else
00455 {
00456 p = binread(p,src);
00457 assert(p<x);
00458 p = binread(p,trg);
00459 }
00460
00461 UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
00462 "Alignment range error at sentence " << sid << "!\n"
00463 << src << "/" << slen1 << " " << trg << "/" << slen2);
00464
00465 if (src < start || src >= stop)
00466 forbidden.set(trg);
00467 else
00468 {
00469 lft = std::min(lft,trg);
00470 rgt = std::max(rgt,trg);
00471 }
00472 if (core_alignment)
00473 {
00474 aln1[src].push_back(trg);
00475 aln2[trg].push_back(src);
00476 }
00477 if (full_alignment)
00478 full_alignment->set(src*slen2 + trg);
00479 }
00480
00481 for (size_t i = lft; i <= rgt; ++i)
00482 if (forbidden[i])
00483 return false;
00484
00485 s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1);
00486 e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2);
00487
00488 if (lft > rgt) return false;
00489 if (core_alignment)
00490 {
00491 core_alignment->clear();
00492 for (size_t i = start; i < stop; ++i)
00493 {
00494 BOOST_FOREACH(ushort x, aln1[i])
00495 {
00496 core_alignment->push_back(i - start);
00497 core_alignment->push_back(x - lft);
00498 }
00499 }
00500
00501 po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
00502 po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
00503 }
00504 return lft <= rgt;
00505 }
00506
00507 template<typename Token>
00508 SPTR<DocumentBias>
00509 Bitext<Token>::
00510 SetupDocumentBias
00511 ( std::string const& bserver, std::string const& text, std::ostream* log ) const
00512 {
00513 SPTR<DocumentBias> ret;
00514 UTIL_THROW_IF2(m_sid2docid == NULL,
00515 "Document bias requested but no document map loaded.");
00516 ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
00517 bserver, text, log));
00518 return ret;
00519 }
00520
00521 template<typename Token>
00522 SPTR<DocumentBias>
00523 Bitext<Token>::
00524 SetupDocumentBias
00525 ( std::map<std::string,float> context_weights, std::ostream* log ) const
00526 {
00527 SPTR<DocumentBias> ret;
00528 UTIL_THROW_IF2(m_sid2docid == NULL,
00529 "Document bias requested but no document map loaded.");
00530 ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
00531 context_weights, log));
00532 return ret;
00533 }
00534
00535 template<typename Token>
00536 void
00537 Bitext<Token>::
00538 prep(iter const& phrase) const
00539 {
00540 prep2(phrase, m_default_sample_size);
00541 }
00542
00543
00544
00545
00546
00547
00548
00549 template<typename Token>
00550 SPTR<pstats>
00551 Bitext<Token>
00552 ::prep2
00553 (iter const& phrase, int max_sample) const
00554 {
00555 if (max_sample < 0) max_sample = m_default_sample_size;
00556 SPTR<SamplingBias> bias;
00557 SPTR<pstats::cache_t> cache;
00558
00559
00560
00561 if (max_sample == int(m_default_sample_size)
00562 && phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
00563 {
00564 cache = (phrase.root == I1.get() ? m_cache1 : m_cache2);
00565 }
00566
00567 SPTR<pstats> ret;
00568 SPTR<pstats> const* cached;
00569
00570 if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached)
00571 return *cached;
00572 boost::unique_lock<boost::shared_mutex> guard(m_lock);
00573 if (!ag)
00574 {
00575 ag.reset(new agenda(*this));
00576 if (m_num_workers > 1)
00577 ag->add_workers(m_num_workers);
00578 }
00579 ret = ag->add_job(this, phrase, max_sample, bias);
00580 if (cache) cache->set(phrase.getPid(),ret);
00581 UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
00582 return ret;
00583 }
00584
00585
00586 template<typename Token>
00587 class pstats2pplist
00588 {
00589 Ttrack<Token> const& m_other;
00590 SPTR<pstats> m_pstats;
00591 std::vector<PhrasePair<Token> >& m_pplist;
00592 typename PhrasePair<Token>::Scorer const* m_scorer;
00593 PhrasePair<Token> m_pp;
00594 Token const* m_token;
00595 size_t m_len;
00596 uint64_t m_pid1;
00597 bool m_is_inverse;
00598 public:
00599
00600
00601 pstats2pplist(typename TSA<Token>::tree_iterator const& m,
00602 Ttrack<Token> const& other,
00603 SPTR<pstats> const& ps,
00604 std::vector<PhrasePair<Token> >& dest,
00605 typename PhrasePair<Token>::Scorer const* scorer)
00606 : m_other(other)
00607 , m_pstats(ps)
00608 , m_pplist(dest)
00609 , m_scorer(scorer)
00610 , m_token(m.getToken(0))
00611 , m_len(m.size())
00612 , m_pid1(m.getPid())
00613 , m_is_inverse(false)
00614 { }
00615
00616
00617 void
00618 operator()()
00619 {
00620
00621 boost::unique_lock<boost::mutex> lock(m_pstats->lock);
00622 while (m_pstats->in_progress)
00623 m_pstats->ready.wait(lock);
00624
00625 m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);
00626
00627
00628 pstats::trg_map_t::iterator a;
00629 for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a)
00630 {
00631 uint32_t sid,off,len;
00632 parse_pid(a->first, sid, off, len);
00633 m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
00634 m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),
00635 m_pp.joint);
00636
00637
00638
00639 size_t J = m_pp.joint<<7;
00640 if (m_pp.good1 > J || m_pp.good2 > J) continue;
00641 if (m_scorer)
00642 {
00643 (*m_scorer)(m_pp);
00644 }
00645 m_pplist.push_back(m_pp);
00646 }
00647 std::greater<PhrasePair<Token> > sorter;
00648 if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter);
00649 }
00650 };
00651
00652 template<typename Token>
00653 void
00654 Bitext<Token>
00655 ::mark_match(Token const* start, Token const* end,
00656 iter const& m, bitvector& check) const
00657 {
00658 check.resize(end-start);
00659 check.reset();
00660 Token const* x = m.getToken(0);
00661 for (Token const* s = start; s < end; ++s)
00662 {
00663 if (s->id() != x->id()) continue;
00664 Token const* a = x;
00665 Token const* b = s;
00666 size_t i = 0;
00667 while (a && b && a->id() == b->id() && i < m.size())
00668 {
00669 ++i;
00670 a = a->next();
00671 b = b->next();
00672 }
00673 if (i == m.size())
00674 {
00675 b = s;
00676 while (i-- > 0) { check.set(b-start); b = b->next(); }
00677 }
00678 }
00679 }
00680
00681 template<typename Token>
00682 void
00683 Bitext<Token>::
00684 write_yawat_alignment
00685 ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const
00686 {
00687 std::vector<int> a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1);
00688 bitvector f1(a1.size()), f2(a2.size());
00689 if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1);
00690 if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2);
00691
00692 std::vector<std::pair<bitvector, bitvector> > agroups;
00693 std::vector<std::string> grouplabel;
00694 std::pair<bitvector, bitvector> ag;
00695 ag.first.resize(a1.size());
00696 ag.second.resize(a2.size());
00697 char const* x = Tx->sntStart(sid);
00698 size_t a, b;
00699 while (x < Tx->sntEnd(sid))
00700 {
00701 x = binread(x,a);
00702 x = binread(x,b);
00703 if (a1.at(a) < 0 && a2.at(b) < 0)
00704 {
00705 a1[a] = a2[b] = agroups.size();
00706 ag.first.reset();
00707 ag.second.reset();
00708 ag.first.set(a);
00709 ag.second.set(b);
00710 agroups.push_back(ag);
00711 grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec");
00712 }
00713 else if (a1.at(a) < 0)
00714 {
00715 a1[a] = a2[b];
00716 agroups[a2[b]].first.set(a);
00717 if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
00718 }
00719 else if (a2.at(b) < 0)
00720 {
00721 a2[b] = a1[a];
00722 agroups[a1[a]].second.set(b);
00723 if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
00724 }
00725 else
00726 {
00727 agroups[a1[a]].first |= agroups[a2[b]].first;
00728 agroups[a1[a]].second |= agroups[a2[b]].second;
00729 a2[b] = a1[a];
00730 if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
00731 }
00732 }
00733
00734 for (a = 0; a < a1.size(); ++a)
00735 {
00736 if (a1[a] < 0)
00737 {
00738 if (f1[a]) out << a << "::" << "infocusmono ";
00739 continue;
00740 }
00741 bitvector const& A = agroups[a1[a]].first;
00742 bitvector const& B = agroups[a1[a]].second;
00743 if (A.find_first() < a) continue;
00744 write_bitvector(A,out); out << ":";
00745 write_bitvector(B,out); out << ":";
00746 out << grouplabel[a1[a]] << " ";
00747 }
00748 for (b = 0; b < a2.size(); ++b)
00749 {
00750 if (a2[b] < 0 && f2[b])
00751 out << "::" << "infocusmono ";
00752 }
00753 }
00754
00755 template<typename Token>
00756 void
00757 expand(typename Bitext<Token>::iter const& m,
00758 Bitext<Token> const& bt, pstats const& ps,
00759 std::vector<PhrasePair<Token> >& dest, std::ostream* log)
00760 {
00761 bool fwd = m.root == bt.I1.get();
00762 dest.reserve(ps.trg.size());
00763 PhrasePair<Token> pp;
00764 pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0);
00765
00766
00767 pstats::trg_map_t::const_iterator a;
00768 for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
00769 {
00770 uint32_t sid,off,len;
00771 parse_pid(a->first, sid, off, len);
00772 pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off,
00773 len, a->second);
00774 dest.push_back(pp);
00775 }
00776 }
00777
00778 }
00779
00780 #include "ug_im_bitext.h"
00781 #include "ug_mm_bitext.h"
00782 #include "ug_bitext_moses.h"