00001
00002
00003
00004
00005
00006 template<typename Token>
00007 class
00008 Bitext<Token>::agenda::
00009 job
00010 {
00011 #if UG_BITEXT_TRACK_ACTIVE_THREADS
00012 static ThreadSafeCounter active;
00013 #endif
00014 Bitext<Token> const* const m_bitext;
00015 boost::mutex lock;
00016 friend class agenda;
00017 boost::taus88 rnd;
00018 double rnddenom;
00019 size_t min_diverse;
00020
00021 bool flip_coin(uint64_t & sid, uint64_t & offset);
00022 bool step(uint64_t & sid, uint64_t & offset);
00023
00024 public:
00025 size_t workers;
00026 SPTR<TSA<Token> const> root;
00027 char const* next;
00028 char const* stop;
00029 size_t max_samples;
00030 size_t ctr;
00031
00032
00033 size_t len;
00034 bool fwd;
00035 SPTR<pstats> stats;
00036 SPTR<SamplingBias const> const m_bias;
00037 float bias_total;
00038 bool m_track_sids;
00039
00040 bool nextSample(uint64_t & sid, uint64_t & offset);
00041
00042 int
00043 check_sample_distribution(uint64_t const& sid, uint64_t const& offset);
00044
00045
00046
00047 bool done() const;
00048 job(Bitext<Token> const* const theBitext,
00049 typename TSA<Token>::tree_iterator const& m,
00050 SPTR<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
00051 SPTR<SamplingBias const> const& bias, bool const track_sids);
00052 ~job();
00053 };
00054
00055 template<typename Token>
00056 Bitext<Token>::agenda::job
00057 ::~job()
00058 {
00059 if (stats) stats.reset();
00060 #if UG_BITEXT_TRACK_ACTIVE_THREADS
00061
00062 try { --active; } catch (...) {}
00063 #endif
00064 }
00065
00066 template<typename Token>
00067 Bitext<Token>::agenda::job
00068 ::job(Bitext<Token> const* const theBitext,
00069 typename TSA<Token>::tree_iterator const& m,
00070 SPTR<TSA<Token> > const& r, size_t maxsmpl,
00071 bool isfwd, SPTR<SamplingBias const> const& bias,
00072 bool const track_sids)
00073 : m_bitext(theBitext)
00074 , rnd(0)
00075 , rnddenom(rnd.max() + 1.)
00076 , min_diverse(1)
00077 , workers(0)
00078 , root(r)
00079 , next(m.lower_bound(-1))
00080 , stop(m.upper_bound(-1))
00081 , max_samples(maxsmpl)
00082 , ctr(0)
00083 , len(m.size())
00084 , fwd(isfwd)
00085 , m_bias(bias)
00086 , m_track_sids(track_sids)
00087 {
00088 stats.reset(new pstats(m_track_sids));
00089 stats->raw_cnt = m.approxOccurrenceCount();
00090 bias_total = 0;
00091
00092
00093
00094
00095 if (m_bias)
00096 {
00097
00098 stats->raw_cnt = 0;
00099 for (char const* x = m.lower_bound(-1); x < stop;)
00100 {
00101 uint32_t sid; ushort offset;
00102 x = root->readSid(x,stop,sid);
00103 x = root->readOffset(x,stop,offset);
00104 #if 0
00105 cerr << ctr++ << " " << m.str(m_bitext->V1.get())
00106 << " " << sid << "/" << root->getCorpusSize()
00107 << " " << offset << " " << stop-x << std::endl;
00108 #endif
00109 bias_total += (*m_bias)[sid];
00110 ++stats->raw_cnt;
00111 }
00112 }
00113 #if UG_BITEXT_TRACK_ACTIVE_THREADS
00114 ++active;
00115
00116
00117 #endif
00118 }
00119
00120 template<typename Token>
00121 bool Bitext<Token>::agenda::job
00122 ::done() const
00123 {
00124 return (max_samples && stats->good >= max_samples) || next == stop;
00125 }
00126
00127 template<typename Token>
00128 int Bitext<Token>::agenda::job
00129 ::check_sample_distribution(uint64_t const& sid, uint64_t const& offset)
00130 {
00131
00132
00133
00134
00135 if (!m_bias) return 1;
00136
00137 typedef boost::math::binomial_distribution<> binomial;
00138
00139 std::ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL;
00140
00141 float p = (*m_bias)[sid];
00142 id_type docid = m_bias->GetClass(sid);
00143
00144 typedef pstats::indoc_map_t::const_iterator id_iter;
00145 id_iter m = stats->indoc.find(docid);
00146 uint32_t k = m != stats->indoc.end() ? m->second : 0 ;
00147
00148
00149
00150 bool ret = (p > .5 || k == 0);
00151
00152 if (ret && !log) return 1;
00153
00154 uint32_t N = stats->good;
00155 float d = cdf(complement(binomial(N, p), k));
00156
00157 ret = ret || d >= .05;
00158
00159 if (log)
00160 {
00161 Token const* t = root->getCorpus()->sntStart(sid)+offset;
00162 Token const* x = t - std::min(offset,uint64_t(3));
00163 Token const* e = t+4;
00164 if (e > root->getCorpus()->sntEnd(sid))
00165 e = root->getCorpus()->sntEnd(sid);
00166 *log << docid << ":" << sid << " " << size_t(k) << "/" << N
00167 << " @" << p << " => " << d << " [";
00168 for (id_iter m = stats->indoc.begin(); m != stats->indoc.end(); ++m)
00169 {
00170 if (m != stats->indoc.begin()) *log << " ";
00171 *log << m->first << ":" << m->second;
00172 }
00173
00174
00175
00176
00177
00178 *log << "] ";
00179 for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " ";
00180 if (!ret) *log << "SKIP";
00181 else if (p < .5 && d > .9) *log << "FORCE";
00182 *log << std::endl;
00183 }
00184
00185 return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0);
00186 }
00187
00188 template<typename Token>
00189 bool Bitext<Token>::agenda::job
00190 ::flip_coin(uint64_t & sid, uint64_t & offset)
00191 {
00192 int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1;
00193 if (no_maybe_yes == 0) return false;
00194 if (no_maybe_yes > 1) return true;
00195
00196 size_t options_chosen = stats->good;
00197 size_t options_total = std::max(stats->raw_cnt, this->ctr);
00198 size_t options_left = (options_total - this->ctr);
00199 size_t random_number = options_left * (rnd()/(rnd.max()+1.));
00200 size_t threshold;
00201 if (bias_total)
00202 threshold = ((*m_bias)[sid]/bias_total * options_total * max_samples);
00203 else
00204 threshold = max_samples;
00205 return random_number + options_chosen < threshold;
00206 }
00207
00208 template<typename Token>
00209 bool Bitext<Token>::agenda::job
00210 ::step(uint64_t & sid, uint64_t & offset)
00211 {
00212 if (next == stop) return false;
00213 UTIL_THROW_IF2(next > stop, "Fatal error at " << HERE << ".");
00214 next = root->readSid(next, stop, sid);
00215 next = root->readOffset(next, stop, offset);
00216 ++ctr;
00217 return true;
00218 }
00219
00220 template<typename Token>
00221 bool Bitext<Token>::agenda::job
00222 ::nextSample(uint64_t & sid, uint64_t & offset)
00223 {
00224 boost::lock_guard<boost::mutex> jguard(lock);
00225 if (max_samples == 0)
00226 return step(sid, offset);
00227
00228 while (step(sid,offset))
00229 {
00230 size_t good = stats->good;
00231 size_t diversity = stats->trg.size();
00232 if (good >= max_samples && diversity >= min_diverse)
00233 return false;
00234
00235
00236
00237
00238 if (!flip_coin(sid,offset)) continue;
00239 return true;
00240 }
00241 return false;
00242 }
00243
#if UG_BITEXT_TRACK_ACTIVE_THREADS
// Definition of the static counter declared in Bitext<Token>::agenda::job.
template<typename Token>
ThreadSafeCounter Bitext<Token>::agenda::job::active;
#endif