Moses: /disk4/html/www/moses/doxygen/mosesdecoder/moses/TranslationModel/UG/mm/ug

00001 // -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
00002 #pragma once
00003 // Implementations of word-aligned bitext.
00004 // Written by Ulrich Germann
00005 //
00006 // mmBitext: static, memory-mapped bitext
00007 // imBitext: dynamic, in-memory bitext
00008 //
00009 
00010 // things we can do to speed up things:
// - set up threads at startup time that force the
//   data into memory sequentially
00013 //
00014 // - use multiple agendas for better load balancing and to avoid
00015 //   competition for locks
00016 //
00017 
00018 
00019 #define UG_BITEXT_TRACK_ACTIVE_THREADS 0
00020 
00021 #include <string>
00022 #include <vector>
00023 #include <cassert>
00024 #include <iomanip>
00025 #include <algorithm>
00026 
00027 #include <boost/foreach.hpp>
00028 #include <boost/random.hpp>
00029 #include <boost/format.hpp>
00030 #include <boost/thread.hpp>
00031 #include <boost/unordered_map.hpp>
00032 #include <boost/math/distributions/binomial.hpp>
00033 
00034 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
00035 #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
00036 #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
00037 #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
00038 #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
00039 // #include "moses/FF/LexicalReordering/LexicalReorderingState.h"
00040 #include "moses/Util.h"
00041 
00042 #ifndef NO_MOSES
00043 // #pragma message "COMPILING WITH MOSES SUPPORT!"
00044 #include "moses/StaticData.h"
00045 #include "moses/thread_safe_container.h"
00046 #include "moses/ContextScope.h"
00047 #include "moses/TranslationTask.h"
00048 #else
00049 // #pragma message "COMPILING WITHOUT MOSES SUPPORT!"
00050 #endif
00051 
00052 #include "util/exception.hh"
00053 // #include "util/check.hh"
00054 
00055 #include "ug_typedefs.h"
00056 #include "ug_mm_ttrack.h"
00057 #include "ug_im_ttrack.h"
00058 #include "ug_mm_tsa.h"
00059 #include "ug_im_tsa.h"
00060 #include "tpt_tokenindex.h"
00061 #include "ug_corpus_token.h"
00062 #include "tpt_pickler.h"
00063 #include "ug_lexical_phrase_scorer2.h"
00064 #include "ug_lru_cache.h"
00065 #include "ug_lexical_reordering.h"
00066 #include "ug_sampling_bias.h"
00067 #include "ug_phrasepair.h"
00068 #include "ug_bitext_phrase_extraction_record.h"
00069 #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
00070 
00071 // Minimum source count for caching phrase lookup statistics.
00072 // If source phrase occurs less frequently, never cache; 
00073 // always re-compute.
00074 #define PSTATS_CACHE_THRESHOLD 50
00075 
00076 namespace Moses { class Mmsapt; }
00077 namespace sapt
00078 {
00079   using Moses::ttasksptr;
00080   using Moses::ttaskwptr;
00081   using tpt::binread;
00082   using tpt::binwrite;
00083 
00084   float lbop(size_t const tries, size_t const succ, float const confidence);
00085   void write_bitvector(bitvector const& v, std::ostream& out);
00086 
00087 #ifndef NO_MOSES
00088   struct
00089   ContextForQuery
00090   {
00091     // needs to be made thread-safe
00092     // ttasksptr const m_ttask;
00093     // size_t max_samples;
00094     boost::shared_mutex lock;
00095     SPTR<SamplingBias> bias;
00096     SPTR<pstats::cache_t> cache1, cache2;
00097     std::ostream* bias_log;
00098     ContextForQuery() : bias_log(NULL) { }
00099   };
00100 #endif
00101 
00102   template<typename Token> class BitextSampler;
00103   
  // Abstract base class for word-aligned bitexts. It bundles, for both
  // language sides, a token track (T1/T2), a vocabulary (V1/V2) and an
  // index (I1/I2), plus the word alignment track (Tx), and offers phrase
  // lookup via (cached) sampling. Concrete classes must implement open().
  template<typename TKN>
  class Bitext // : public Moses::reference_counter
  {
  public:
    template<typename Token> friend class BitextSampler;
    typedef TKN Token;
    typedef typename TSA<Token>::tree_iterator   iter;
    typedef typename std::vector<PhrasePair<Token> > vec_ppair;
    typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t;
    typedef TSA<Token> tsa;
    friend class Moses::Mmsapt;
  protected:
    mutable boost::shared_mutex m_lock; // for thread-safe operation

    class agenda; // for parallel sampling see ug_bitext_agenda.h
    // Created lazily by prep2() on the first sampling request.
    mutable SPTR<agenda> ag;
    // Number of workers available to the agenda; prep2() only adds
    // workers when this is greater than 1.
    size_t m_num_workers;

    // Sample size used whenever a caller passes a negative max_sample.
    size_t m_default_sample_size;
    // Phrases whose approximate occurrence count does not exceed this
    // threshold are never cached (see prep2()).
    size_t m_pstats_cache_threshold; // threshold for caching sampling results
    // Caches for sampling results: m_cache1 for phrases rooted in I1,
    // m_cache2 for all others. Both are replaced by fresh caches when the
    // default sample size changes.
    SPTR<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results

    std::vector<std::string> m_docname; // document names, indexed by doc id
    std::map<std::string,id_type>  m_docname2docid; // maps from doc names to ids
    // May be empty (NULL) when no document map has been loaded.
    SPTR<std::vector<id_type> >   m_sid2docid; // maps from sentences to docs (ids)

    mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
    // caches for unbiased sampling; biased sampling uses the caches that
    // are stored locally on the translation task
  public:
    SPTR<Ttrack<char> >  Tx; // word alignments
    SPTR<Ttrack<Token> > T1; // token track
    SPTR<Ttrack<Token> > T2; // token track
    SPTR<TokenIndex>     V1; // vocab
    SPTR<TokenIndex>     V2; // vocab
    SPTR<TSA<Token> >    I1; // indices
    SPTR<TSA<Token> >    I2; // indices

    //  Given a source phrase (sentence sid, positions start .. stop-1),
    //  find the possible start (s1 .. s2) and end (e1 .. e2)
    //  points of the target phrase; if non-NULL, store word
    //  alignments in *core_alignment. If /flip/, source phrase is
    //  L2.
    //  Returns false if no consistent target phrase exists.
    //  The first overload unpacks /rec/ and forwards to the second.
    bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const;
    bool find_trg_phr_bounds
    ( size_t const sid,    // sentence to investigate
      size_t const start,  // start of source phrase
      size_t const stop,   // one past the last position of source phrase
      size_t & s1, size_t & s2, // beginning and end of target start
      size_t & e1, size_t & e2, // beginning and end of target end
      int& po_fwd, int& po_bwd, // phrase orientations (set only if core_alignment != NULL)
      std::vector<unsigned char> * core_alignment, // stores the core alignment
      bitvector* full_alignment, // stores full word alignment for this sent.
      bool const flip) const;   // flip source and target (reverse lookup)

    // prep2 launches sampling and returns immediately.
    // lookup (below) waits for the job to finish before it returns
    // A negative max_sample selects the default sample size.
    SPTR<pstats>
    prep2(iter const& phrase, int max_sample = -1) const;

#ifndef NO_MOSES
    // Task-aware variant; not defined in this part of the file.
    SPTR<pstats>
    prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
          int max_sample = -1) const;
#endif 

  protected:
    // Creates an empty bitext; the tracks, vocabularies and indices are
    // left unset and must be supplied later (e.g. by open()).
    Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16);

    // Wraps existing components. The raw pointers are stored in the SPTR
    // members above (presumably transferring ownership -- confirm what
    // SPTR expands to).
    Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2,
           Ttrack<char>*  const tx,
           TokenIndex*    const v1, TokenIndex*    const v2,
           TSA<Token>*    const i1, TSA<Token>*    const i2,
           size_t const max_sample=1000,
           size_t const xnum_workers=16);
  public:
    // Loads the bitext identified by base name and language tags;
    // implemented by the concrete bitext classes.
    virtual void
    open(std::string const base, std::string const L1, std::string const L2) = 0;

    // Not defined in this part of the file; per the note on prep2 above,
    // lookup waits for the sampling job to finish.
    SPTR<pstats> 
    lookup(iter const& phrase, int max_sample = -1) const;

    // Schedules sampling for /phrase/ with the default sample size.
    void prep(iter const& phrase) const;

#ifndef NO_MOSES
    SPTR<pstats>
    lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;

    void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
#endif

    // Changing the default sample size discards the sampling caches.
    void   setDefaultSampleSize(size_t const max_samples);
    size_t getDefaultSampleSize() const;

    // Renders the phrase identified by /pid/ as space-separated words,
    // using side 2 if /isL2/ is non-zero and side 1 otherwise.
    std::string toString(uint64_t pid, int isL2) const;

    // The base class always reports revision 0.
    virtual size_t revision() const { return 0; }

    // Reads one bias value per sentence of T1 from file /fname/.
    SPTR<SentenceBias>
    loadSentenceBias(std::string const& fname) const;

    // Both overloads throw if no sentence-to-document map is loaded.
    SPTR<DocumentBias>
    SetupDocumentBias(std::string const& bserver, std::string const& text, 
                      std::ostream* log) const;

    SPTR<DocumentBias>
    SetupDocumentBias(std::map<std::string,float> context_weights, 
                      std::ostream* log) const;

    // Marks in /check/ (one bit per token of [start,end)) all positions
    // covered by an occurrence of the phrase /m/.
    void
    mark_match(Token const* start, Token const* end, iter const& m,
               bitvector& check) const;
    // Writes the word alignment of sentence /sid/ to /out/ as
    // "src:trg:label" groups; m1 / m2 (may be NULL) mark phrases in focus.
    void
    write_yawat_alignment
    ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const;

    // Returns "" if the sentence or its document is unknown.
    std::string sid2docname(id_type const sid) const;
    // Falls back to the decimal id if the document id is unknown.
    std::string docid2name(id_type const sid) const;
    // Returns -1 if the document name is unknown.
    int docname2docid(std::string const& name) const;
    
    // Raw access to the sentence-to-document map (may be NULL).
    std::vector<id_type> const* sid2did() const;
    // Returns -1 if no sentence-to-document map is loaded.
    int sid2did(uint32_t sid) const;
  };
00227 
00228   #include "ug_bitext_agenda.h"
00229 
00230   template<typename Token>
00231   int
00232   Bitext<Token>::
00233   docname2docid(std::string const& name) const
00234   {
00235     std::map<std::string,id_type>::const_iterator m;
00236     m = m_docname2docid.find(name);
00237     if (m != m_docname2docid.end()) return m->second;
00238     return -1;
00239   }
00240 
00241   template<typename Token>
00242   std::string
00243   Bitext<Token>::
00244   docid2name(id_type const did) const
00245   {
00246     if (did < m_docname.size())
00247       return m_docname[did];
00248     else
00249       return (boost::format("%d") % did).str();
00250   }
00251 
00252   template<typename Token>
00253   std::string
00254   Bitext<Token>::
00255   sid2docname(id_type const sid) const
00256   {
00257     if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size())
00258       return m_docname[(*m_sid2docid)[sid]];
00259     else
00260       return "";
00261   }
00262 
00263   template<typename Token>
00264   std::vector<id_type> const*
00265   Bitext<Token>::
00266   sid2did() const
00267   {
00268     return m_sid2docid.get();
00269   }
00270 
00271   template<typename Token>
00272   int
00273   Bitext<Token>::
00274   sid2did(uint32_t sid) const
00275   {
00276     if (m_sid2docid) 
00277       return m_sid2docid->at(sid);
00278     return -1;
00279   }
00280 
00281 
00282   template<typename Token>
00283   SPTR<SentenceBias>
00284   Bitext<Token>::
00285   loadSentenceBias(std::string const& fname) const
00286   {
00287     SPTR<SentenceBias> ret(new SentenceBias(T1->size()));
00288     std::ifstream in(fname.c_str());
00289     size_t i = 0;
00290     float v; while (in>>v) (*ret)[i++] = v;
00291     UTIL_THROW_IF2(i != T1->size(),
00292                    "Mismatch between bias vector size and corpus size at "
00293                    << HERE);
00294     return ret;
00295   }
00296 
00297   template<typename Token>
00298   std::string
00299   Bitext<Token>::
00300   toString(uint64_t pid, int isL2) const
00301   {
00302     std::ostringstream buf;
00303     uint32_t sid,off,len; parse_pid(pid,sid,off,len);
00304     Token const* t = (isL2 ? T2 : T1)->sntStart(sid) + off;
00305     Token const* x = t + len;
00306     TokenIndex const& V = isL2 ? *V2 : *V1;
00307     while (t < x)
00308       {
00309         buf << V[t->id()];
00310         if (++t < x) buf << " ";
00311       }
00312     return buf.str();
00313   }
00314 
00315   template<typename Token>
00316   size_t
00317   Bitext<Token>::
00318   getDefaultSampleSize() const
00319   {
00320     return m_default_sample_size;
00321   }
00322   template<typename Token>
00323   void
00324   Bitext<Token>::
00325   setDefaultSampleSize(size_t const max_samples)
00326   {
00327     boost::unique_lock<boost::shared_mutex> guard(m_lock);
00328     if (max_samples != m_default_sample_size)
00329       {
00330         m_cache1.reset(new pstats::cache_t);
00331         m_cache2.reset(new pstats::cache_t);
00332         m_default_sample_size = max_samples;
00333       }
00334   }
00335 
00336   template<typename Token>
00337   Bitext<Token>::
00338   Bitext(size_t const max_sample, size_t const xnum_workers)
00339     : m_num_workers(xnum_workers)
00340     , m_default_sample_size(max_sample)
00341     , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
00342     , m_cache1(new pstats::cache_t)
00343     , m_cache2(new pstats::cache_t)
00344   { }
00345 
00346   template<typename Token>
00347   Bitext<Token>::
00348   Bitext(Ttrack<Token>* const t1,
00349          Ttrack<Token>* const t2,
00350          Ttrack<char>*  const tx,
00351          TokenIndex*    const v1,
00352          TokenIndex*    const v2,
00353          TSA<Token>* const i1,
00354          TSA<Token>* const i2,
00355          size_t const max_sample,
00356          size_t const xnum_workers)
00357     : m_num_workers(xnum_workers)
00358     , m_default_sample_size(max_sample)
00359     , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
00360     , m_cache1(new pstats::cache_t)
00361     , m_cache2(new pstats::cache_t)
00362     , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
00363   { }
00364 
  // Callable helper that bundles a batch of sentences with the vocabulary,
  // token track and index they belong to. Only the specialization for
  // L2R_Token<SimpleWordId> is declared here; constructor and operator()
  // are defined elsewhere.
  template<typename TKN> class snt_adder;
  template<>             class snt_adder<L2R_Token<SimpleWordId> >;

  template<>
  class snt_adder<L2R_Token<SimpleWordId> >
  {
    typedef L2R_Token<SimpleWordId> TKN;
    // All members are references: the referenced objects must outlive
    // this object.
    std::vector<std::string> const & snt; // the sentences handed to the constructor
    TokenIndex           & V;     // vocabulary
    SPTR<imTtrack<TKN> > & track; // in-memory token track
    SPTR<imTSA<TKN > >   & index; // in-memory index
  public:
    snt_adder(std::vector<std::string> const& s, TokenIndex& v,
              SPTR<imTtrack<TKN> >& t, SPTR<imTSA<TKN> >& i);

    // Performs the actual work (presumably adds the sentences to track
    // and index -- see the definition).
    void operator()();
  };
00382 
00383   template<typename Token>
00384   bool
00385   Bitext<Token>::
00386   find_trg_phr_bounds(PhraseExtractionRecord& rec) const
00387   {
00388     return find_trg_phr_bounds(rec.sid, rec.start, rec.stop,
00389                                rec.s1, rec.s2, rec.e1, rec.e2,
00390                                rec.po_fwd, rec.po_bwd, 
00391                                rec.aln, rec.full_aln, rec.flip);
00392   }
00393 
00394   template<typename Token>
00395   bool
00396   Bitext<Token>::
00397   find_trg_phr_bounds
00398   ( size_t const sid,    // sentence to investigate
00399     size_t const start,  // start of source phrase
00400     size_t const stop,   // last position of source phrase
00401     size_t & s1, size_t & s2, // beginning and end of target start
00402     size_t & e1, size_t & e2, // beginning and end of target end
00403     int& po_fwd, int& po_bwd, // phrase orientations
00404     std::vector<unsigned char> * core_alignment, // stores the core alignment
00405     bitvector* full_alignment, // stores full word alignment for this sent.
00406     bool const flip) const     // flip source and target (reverse lookup)
00407   {
00408     // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
00409     // a word on the core_alignment (core_alignment):
00410     //
00411     // Since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1
00412     // < e2, respectively) are be definition unaligned, we store
00413     // only the core alignment in *aln. It is up to the calling
00414     // function to shift alignment points over for start positions
00415     // of extracted phrases that start with a fringe word
00416     assert(T1);
00417     assert(T2);
00418     assert(Tx);
00419 
00420     size_t slen1,slen2;
00421     if (flip)
00422       {
00423         slen1 = T2->sntLen(sid);
00424         slen2 = T1->sntLen(sid);
00425       }
00426     else
00427       {
00428         slen1 = T1->sntLen(sid);
00429         slen2 = T2->sntLen(sid);
00430       }
00431     bitvector forbidden(slen2);
00432     if (full_alignment)
00433       {
00434         if (slen1*slen2 > full_alignment->size())
00435           full_alignment->resize(slen1*slen2*2);
00436         full_alignment->reset();
00437       }
00438     size_t src,trg;
00439     size_t lft = forbidden.size();
00440     size_t rgt = 0;
00441     std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
00442 
00443     // process word alignment for this sentence
00444     char const* p = Tx->sntStart(sid);
00445     char const* x = Tx->sntEnd(sid);
00446     while (p < x)
00447       {
00448         if (flip) 
00449           { 
00450             p = binread(p,trg); 
00451             assert(p<x); 
00452             p = binread(p,src); 
00453           }
00454         else 
00455           { 
00456             p = binread(p,src); 
00457             assert(p<x); 
00458             p = binread(p,trg); 
00459           }
00460           
00461         UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
00462                        "Alignment range error at sentence " << sid << "!\n"
00463                        << src << "/" << slen1 << " " << trg << "/" << slen2);
00464           
00465         if (src < start || src >= stop)
00466           forbidden.set(trg);
00467         else
00468           {
00469             lft = std::min(lft,trg);
00470             rgt = std::max(rgt,trg);
00471           }
00472         if (core_alignment)
00473           {
00474             aln1[src].push_back(trg);
00475             aln2[trg].push_back(src);
00476           }
00477         if (full_alignment)
00478           full_alignment->set(src*slen2 + trg);
00479       }
00480 
00481     for (size_t i = lft; i <= rgt; ++i)
00482       if (forbidden[i])
00483         return false;
00484 
00485     s2 = lft;   for (s1 = s2; s1 && !forbidden[s1-1]; --s1);
00486     e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2);
00487 
00488     if (lft > rgt) return false;
00489     if (core_alignment)
00490       {
00491         core_alignment->clear();
00492         for (size_t i = start; i < stop; ++i)
00493           {
00494             BOOST_FOREACH(ushort x, aln1[i])
00495               {
00496                 core_alignment->push_back(i - start);
00497                 core_alignment->push_back(x - lft);
00498               }
00499           }
00500         // now determine fwd and bwd phrase orientation
00501         po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
00502         po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
00503       }
00504     return lft <= rgt;
00505   }
00506 
00507   template<typename Token>
00508   SPTR<DocumentBias>
00509   Bitext<Token>::
00510   SetupDocumentBias
00511   ( std::string const& bserver, std::string const& text, std::ostream* log ) const
00512   {
00513     SPTR<DocumentBias> ret;
00514     UTIL_THROW_IF2(m_sid2docid == NULL,
00515                    "Document bias requested but no document map loaded.");
00516     ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
00517                                bserver, text, log));
00518     return ret;
00519   }
00520 
00521   template<typename Token>
00522   SPTR<DocumentBias>
00523   Bitext<Token>::
00524   SetupDocumentBias
00525   ( std::map<std::string,float> context_weights, std::ostream* log ) const
00526   {
00527     SPTR<DocumentBias> ret;
00528     UTIL_THROW_IF2(m_sid2docid == NULL,
00529                    "Document bias requested but no document map loaded.");
00530     ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
00531                                context_weights, log));
00532     return ret;
00533   }
00534 
00535   template<typename Token>
00536   void
00537   Bitext<Token>::
00538   prep(iter const& phrase) const
00539   {
00540     prep2(phrase, m_default_sample_size);
00541   }
00542 
00543 
00544 
00545   // prep2 schedules a phrase for sampling, and returns immediately
00546   // the member function lookup retrieves the respective pstats instance
00547   // and waits until the sampling is finished before it returns.
00548   // This allows sampling in the background
00549   template<typename Token>
00550   SPTR<pstats>
00551   Bitext<Token>
00552   ::prep2
00553   (iter const& phrase, int max_sample) const
00554   {
00555     if (max_sample < 0) max_sample = m_default_sample_size;
00556     SPTR<SamplingBias> bias;
00557     SPTR<pstats::cache_t> cache;
00558     // - no caching for rare phrases and special requests (max_sample)
00559     //   (still need to test what a good caching threshold is ...)
00560     // - use the task-specific cache when there is a sampling bias
00561     if (max_sample == int(m_default_sample_size)
00562         && phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
00563       {
00564         cache = (phrase.root == I1.get() ? m_cache1 : m_cache2);
00565       }
00566 
00567     SPTR<pstats> ret;
00568     SPTR<pstats> const* cached;
00569 
00570     if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached)
00571       return *cached;
00572     boost::unique_lock<boost::shared_mutex> guard(m_lock);
00573     if (!ag)
00574       {
00575         ag.reset(new agenda(*this));
00576         if (m_num_workers > 1)
00577           ag->add_workers(m_num_workers);
00578       }
00579     ret = ag->add_job(this, phrase, max_sample, bias);
00580     if (cache) cache->set(phrase.getPid(),ret);
00581     UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
00582     return ret;
00583   }
00584 
00585   // worker for scoring and sorting phrase table entries in parallel
00586   template<typename Token>
00587   class pstats2pplist
00588   {
00589     Ttrack<Token> const& m_other;
00590     SPTR<pstats> m_pstats;
00591     std::vector<PhrasePair<Token> >& m_pplist;
00592     typename PhrasePair<Token>::Scorer const* m_scorer;
00593     PhrasePair<Token> m_pp;
00594     Token const* m_token;
00595     size_t m_len;
00596     uint64_t m_pid1;
00597     bool m_is_inverse;
00598   public:
00599 
00600     // CONSTRUCTOR
00601     pstats2pplist(typename TSA<Token>::tree_iterator const& m,
00602                   Ttrack<Token> const& other,
00603                   SPTR<pstats> const& ps,
00604                   std::vector<PhrasePair<Token> >& dest,
00605                   typename PhrasePair<Token>::Scorer const* scorer)
00606       : m_other(other)
00607       , m_pstats(ps)
00608       , m_pplist(dest)
00609       , m_scorer(scorer)
00610       , m_token(m.getToken(0))
00611       , m_len(m.size())
00612       , m_pid1(m.getPid())
00613       , m_is_inverse(false)
00614     { }
00615 
00616     // WORKER
00617     void
00618     operator()()
00619     {
00620       // wait till all statistics have been collected
00621       boost::unique_lock<boost::mutex> lock(m_pstats->lock);
00622       while (m_pstats->in_progress)
00623         m_pstats->ready.wait(lock);
00624 
00625       m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);
00626 
00627       // convert pstats entries to phrase pairs
00628       pstats::trg_map_t::iterator a;
00629       for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a)
00630         {
00631           uint32_t sid,off,len;
00632           parse_pid(a->first, sid, off, len);
00633           m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
00634           m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),
00635                            m_pp.joint);
00636           // Poor man's early pruning: if p(f|e) or p(e|f) < 1/128, don't
00637           // even consider the phrase pair, as it is unlikely to ever be 
00638           // considered as a valid translation. 
00639           size_t J = m_pp.joint<<7; // hard coded threshold of 1/128
00640           if (m_pp.good1 > J || m_pp.good2 > J) continue;
00641           if (m_scorer)
00642             {
00643               (*m_scorer)(m_pp);
00644             }
00645           m_pplist.push_back(m_pp);
00646         }
00647       std::greater<PhrasePair<Token> > sorter;
00648       if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter);
00649     }
00650   };
00651 
00652   template<typename Token>
00653   void
00654   Bitext<Token>
00655   ::mark_match(Token const* start, Token const* end,
00656                iter const& m, bitvector& check) const
00657   {
00658     check.resize(end-start);
00659     check.reset();
00660     Token const* x = m.getToken(0);
00661     for (Token const* s = start; s < end; ++s)
00662       {
00663         if (s->id() != x->id()) continue;
00664         Token const* a = x;
00665         Token const* b = s;
00666         size_t i = 0;
00667         while (a && b && a->id() == b->id() && i < m.size())
00668           {
00669             ++i;
00670             a = a->next();
00671             b = b->next();
00672           }
00673         if (i == m.size())
00674           {
00675             b = s;
00676             while (i-- > 0) { check.set(b-start); b = b->next(); }
00677           }
00678       }
00679   }
00680 
00681   template<typename Token>
00682   void
00683   Bitext<Token>::
00684   write_yawat_alignment
00685   ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const
00686   {
00687     std::vector<int> a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1);
00688     bitvector f1(a1.size()), f2(a2.size());
00689     if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1);
00690     if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2);
00691 
00692     std::vector<std::pair<bitvector, bitvector> > agroups;
00693     std::vector<std::string> grouplabel;
00694     std::pair<bitvector, bitvector> ag;
00695     ag.first.resize(a1.size());
00696     ag.second.resize(a2.size());
00697     char const* x = Tx->sntStart(sid);
00698     size_t a, b;
00699     while (x < Tx->sntEnd(sid))
00700       {
00701         x = binread(x,a);
00702         x = binread(x,b);
00703         if (a1.at(a) < 0 && a2.at(b) < 0)
00704           {
00705             a1[a] = a2[b] = agroups.size();
00706             ag.first.reset();
00707             ag.second.reset();
00708             ag.first.set(a);
00709             ag.second.set(b);
00710             agroups.push_back(ag);
00711             grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec");
00712           }
00713         else if (a1.at(a) < 0)
00714           {
00715             a1[a] = a2[b];
00716             agroups[a2[b]].first.set(a);
00717             if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
00718           }
00719         else if (a2.at(b) < 0)
00720           {
00721             a2[b] = a1[a];
00722             agroups[a1[a]].second.set(b);
00723             if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
00724           }
00725         else
00726           {
00727             agroups[a1[a]].first  |= agroups[a2[b]].first;
00728             agroups[a1[a]].second |= agroups[a2[b]].second;
00729             a2[b] = a1[a];
00730             if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
00731           }
00732       }
00733 
00734     for (a = 0; a < a1.size(); ++a)
00735       {
00736         if (a1[a] < 0)
00737           {
00738             if (f1[a]) out << a << "::" << "infocusmono ";
00739             continue;
00740           }
00741         bitvector const& A = agroups[a1[a]].first;
00742         bitvector const& B = agroups[a1[a]].second;
00743         if (A.find_first() < a) continue;
00744         write_bitvector(A,out); out << ":";
00745         write_bitvector(B,out); out << ":";
00746         out << grouplabel[a1[a]] << " ";
00747       }
00748     for (b = 0; b < a2.size(); ++b)
00749       {
00750         if (a2[b] < 0 && f2[b])
00751           out <<  "::" << "infocusmono ";
00752       }
00753   }
00754 
00755   template<typename Token>
00756   void
00757   expand(typename Bitext<Token>::iter const& m,
00758          Bitext<Token> const& bt, pstats const& ps,
00759          std::vector<PhrasePair<Token> >& dest, std::ostream* log)
00760   {
00761     bool fwd = m.root == bt.I1.get();
00762     dest.reserve(ps.trg.size());
00763     PhrasePair<Token> pp;
00764     pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0);
00765     // cout << HERE << " "
00766     // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << std::endl;
00767     pstats::trg_map_t::const_iterator a;
00768     for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
00769       {
00770         uint32_t sid,off,len;
00771         parse_pid(a->first, sid, off, len);
00772         pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off,
00773                   len, a->second);
00774         dest.push_back(pp);
00775       }
00776   }
00777 
00778 } // end of namespace sapt
00779 
00780 #include "ug_im_bitext.h"
00781 #include "ug_mm_bitext.h"
00782 #include "ug_bitext_moses.h"
/disk4/html/www/moses/doxygen/mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.h