00001 #ifndef __ug_bitext_base_h
00002 #define __ug_bitext_base_h
00003
00004
00005
#include <algorithm>
#include <cassert>
#include <iomanip>
#include <list>
#include <string>
#include <utility>
#include <vector>

#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>

#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"

#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"
#include "ug_mm_tsa.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
00026
00027 namespace Moses {
00028
00029 typedef L2R_Token<SimpleWordId> Token;
00030 typedef mmTSA<Token>::tree_iterator iter;
00031
00032 class bitext_base
00033 {
00034 public:
00035 typedef mmTSA<Token>::tree_iterator iter;
00036 class pstats;
00037 class jstats;
00038 class agenda
00039 {
00040 boost::mutex lock;
00041 boost::condition_variable ready;
00042 class job;
00043 class worker;
00044 list<job> joblist;
00045 std::vector<SPTR<boost::thread> > workers;
00046 bool shutdown;
00047 size_t doomed;
00048 public:
00049 bitext_base const& bitext;
00050 agenda(bitext_base const& bitext);
00051 ~agenda();
00052 void add_workers(int n);
00053 SPTR<pstats> add_job(mmbitext::iter const& phrase,
00054 size_t const max_samples);
00055 bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
00056 bool & fwd, SPTR<bitext_base::pstats> & stats);
00057 };
00058
00059
00060
00061
00062 agenda* ag;
00063 mmTtrack<char> Tx;
00064 mmTtrack<Token> T1,T2;
00065 TokenIndex V1,V2;
00066 mmTSA<Token> I1,I2;
00067
00069
00070
00071
00072
00073 bool
00074 find_trg_phr_bounds
00075 (size_t const sid, size_t const start, size_t const stop,
00076 size_t & s1, size_t & s2, size_t & e1, size_t & e2,
00077 std::vector<uchar> * core_alignment, bool const flip) const;
00078
00079 boost::unordered_map<uint64_t,SPTR<pstats> > cache1,cache2;
00080 private:
00081 SPTR<pstats>
00082 prep2(iter const& phrase);
00083 public:
00084 mmbitext();
00085 ~mmbitext();
00086
00087 void open(std::string const base, std::string const L1, std::string const L2);
00088
00089 SPTR<pstats> lookup(iter const& phrase);
00090 void prep(iter const& phrase);
00091 };
00092
00093
00094 class
00095 mmbitext::
00096 jstats
00097 {
00098 uint32_t my_rcnt;
00099 float my_wcnt;
00100 std::vector<pair<size_t, std::vector<uchar> > > my_aln;
00101 boost::mutex lock;
00102 public:
00103 jstats();
00104 jstats(jstats const& other);
00105 uint32_t rcnt() const;
00106 float wcnt() const;
00107 std::vector<pair<size_t, std::vector<uchar> > > const & aln() const;
00108 void add(float w, std::vector<uchar> const& a);
00109 };
00110
00111 struct
00112 mmbitext::
00113 pstats
00114 {
00115 boost::mutex lock;
00116 boost::condition_variable ready;
00117
00118 size_t raw_cnt;
00119 size_t sample_cnt;
00120 size_t good;
00121 size_t sum_pairs;
00122
00123
00124 size_t in_progress;
00125 boost::unordered_map<uint64_t, jstats> trg;
00126 pstats();
00127
00128
00129 void release();
00130 void register_worker();
00131 void add(mmbitext::iter const& trg_phrase, float const w,
00132 std::vector<uchar> const& a);
00133 };
00134
00135 class
00136 mmbitext::
00137 agenda::
00138 worker
00139 {
00140 agenda& ag;
00141 public:
00142 worker(agenda& a);
00143 void operator()();
00144
00145 };
00146
00147 class
00148 mmbitext::
00149 agenda::
00150 job
00151 {
00152 public:
00153 char const* next;
00154 char const* stop;
00155 size_t max_samples;
00156 size_t ctr;
00157 size_t len;
00158 bool fwd;
00159 SPTR<mmbitext::pstats> stats;
00160 bool step(uint64_t & sid, uint64_t & offset);
00161 };
00162
00163 }
00164 #endif
00165