00001
00002
00003 #ifndef _ug_im_tsa_h
00004 #define _ug_im_tsa_h
00005
00006
00007
00008
00009 #include <iostream>
00010
00011 #include <boost/iostreams/device/mapped_file.hpp>
00012 #include <boost/shared_ptr.hpp>
00013 #include <boost/dynamic_bitset.hpp>
00014 #include <boost/foreach.hpp>
00015 #include <boost/thread.hpp>
00016
00017 #include "tpt_tightindex.h"
00018 #include "tpt_tokenindex.h"
00019 #include "ug_tsa_base.h"
00020 #include "tpt_pickler.h"
00021
00022 #include "moses/TranslationModel/UG/generic/threading/ug_thread_pool.h"
00023 #include "util/usage.hh"
00024
00025 namespace sapt
00026 {
00027 namespace bio=boost::iostreams;
00028
00029 template<typename TOKEN, typename SORTER>
00030 class TsaSorter
00031 {
00032 public:
00033 typedef typename Ttrack<TOKEN>::Position cpos;
00034 typedef typename std::vector<cpos>::iterator iter;
00035 private:
00036 SORTER m_sorter;
00037 iter m_begin;
00038 iter m_end;
00039 public:
00040 TsaSorter(SORTER sorter, iter& begin, iter& end)
00041 : m_sorter(sorter),
00042 m_begin(begin),
00043 m_end(end) { }
00044
00045 bool
00046 operator()()
00047 {
00048 std::sort(m_begin, m_end, m_sorter);
00049 return true;
00050 }
00051
00052 };
00053
00054
00055
00056 template<typename TOKEN>
00057 class imTSA : public TSA<TOKEN>
00058 {
00059 typedef typename Ttrack<TOKEN>::Position cpos;
00060
00061 public:
00062 class tree_iterator;
00063 friend class tree_iterator;
00064
00065 private:
00066 std::vector<cpos> sufa;
00067 std::vector<filepos_type> index;
00068
00069 private:
00070 char const*
00071 index_jump(char const* a, char const* z, float ratio) const;
00072
00073 char const*
00074 getLowerBound(id_type id) const;
00075
00076 char const*
00077 getUpperBound(id_type id) const;
00078
00079 public:
00080 imTSA();
00081 imTSA(boost::shared_ptr<Ttrack<TOKEN> const> c, bdBitset const* filt,
00082 std::ostream* log = NULL, size_t threads = 0);
00083
00084 imTSA(imTSA<TOKEN> const& prior,
00085 boost::shared_ptr<imTtrack<TOKEN> const> const& crp,
00086 std::vector<id_type> const& newsids, size_t const vsize);
00087
00088 count_type
00089 sntCnt(char const* p, char const * const q) const;
00090
00091 count_type
00092 rawCnt(char const* p, char const * const q) const;
00093
00094 void
00095 getCounts(char const* p, char const * const q,
00096 count_type& sids, count_type& raw) const;
00097
00098 char const*
00099 readSid(char const* p, char const* q, id_type& sid) const;
00100
00101 char const*
00102 readSid(char const* p, char const* q, ::uint64_t& sid) const;
00103
00104 char const*
00105 readOffset(char const* p, char const* q, uint16_t& offset) const;
00106
00107 char const*
00108 readOffset(char const* p, char const* q, ::uint64_t& offset) const;
00109
00110 void
00111 sanityCheck() const;
00112
00113 void
00114 save_as_mm_tsa(std::string fname) const;
00115
00117
00118
00119 };
00120
00121 template<typename TOKEN>
00122 class
00123 imTSA<TOKEN>::
00124 tree_iterator : public TSA<TOKEN>::tree_iterator
00125 {
00126 public:
00127 tree_iterator(imTSA<TOKEN> const* s);
00128 };
00129
00130 template<typename TOKEN>
00131 imTSA<TOKEN>::
00132 tree_iterator::
00133 tree_iterator(imTSA<TOKEN> const* s)
00134 : TSA<TOKEN>::tree_iterator::tree_iterator(reinterpret_cast<TSA<TOKEN> const*>(s))
00135 {};
00136
00140 template<typename TOKEN>
00141 char const*
00142 imTSA<TOKEN>::
00143 index_jump(char const* a, char const* z, float ratio) const
00144 {
00145 typedef cpos cpos;
00146 assert(ratio >= 0 && ratio < 1);
00147 cpos const* xa = reinterpret_cast<cpos const*>(a);
00148 cpos const* xz = reinterpret_cast<cpos const*>(z);
00149 return reinterpret_cast<char const*>(xa+int(ratio*(xz-xa)));
00150 }
00151
00152 template<typename TOKEN>
00153 imTSA<TOKEN>::
00154 imTSA()
00155 {
00156 this->indexSize = 0;
00157
00158 this->startArray = NULL;
00159 this->endArray = NULL;
00160 this->corpusSize = 0;
00161 this->BitSetCachingThreshold=4096;
00162 };
00163
00164
00165
00166 template<typename TOKEN>
00167 imTSA<TOKEN>::
00168 imTSA(boost::shared_ptr<Ttrack<TOKEN> const> c,
00169 bdBitset const* filter, std::ostream* log, size_t threads)
00170 {
00171 if (threads == 0)
00172 threads = boost::thread::hardware_concurrency();
00173 assert(c);
00174 this->corpus = c;
00175 bdBitset filter2;
00176 if (!filter)
00177 {
00178 filter2.resize(c->size());
00179 filter2.set();
00180 filter = &filter2;
00181 }
00182 assert(filter);
00183
00184
00185
00186
00187
00188
00189
00190 if (log) *log << "counting tokens ... ";
00191 int slimit = 65536;
00192
00193
00194
00195
00196
00197 std::vector<count_type> wcnt;
00198 sufa.resize(c->count_tokens(wcnt,filter,slimit,log));
00199
00200 if (log) *log << sufa.size() << "." << std::endl;
00201
00202
00203
00204 std::vector<count_type> tmp(wcnt.size(),0);
00205 for (size_t i = 1; i < wcnt.size(); ++i)
00206 tmp[i] = tmp[i-1] + wcnt[i-1];
00207
00208
00209 this->corpusSize = 0;
00210 for (id_type sid = filter->find_first();
00211 sid < filter->size();
00212 sid = filter->find_next(sid))
00213 {
00214 TOKEN const* k = c->sntStart(sid);
00215 TOKEN const* const stop = c->sntEnd(sid);
00216 if (stop - k >= slimit) continue;
00217 this->corpusSize++;
00218 for (ushort p=0; k < stop; ++p,++k)
00219 {
00220 id_type wid = k->id();
00221 cpos& cpos = sufa[tmp[wid]++];
00222 cpos.sid = sid;
00223 cpos.offset = p;
00224 assert(p < c->sntLen(sid));
00225 }
00226 }
00227
00228
00229 if (log) *log << "sorting .... with " << threads << " threads." << std::endl;
00230 #ifndef NO_MOSES
00231 double start_time = util::WallTime();
00232 #endif
00233 boost::scoped_ptr<ug::ThreadPool> tpool;
00234 tpool.reset(new ug::ThreadPool(threads));
00235
00236 index.resize(wcnt.size()+1,0);
00237 typedef typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter_t;
00238 sorter_t sorter(c.get());
00239 for (size_t i = 0; i < wcnt.size(); i++)
00240 {
00241
00242
00243
00244 index[i+1] = index[i]+wcnt[i];
00245 assert(index[i+1]==tmp[i]);
00246 if (wcnt[i]>1)
00247 {
00248 typename std::vector<cpos>::iterator b,e;
00249 b = sufa.begin()+index[i];
00250 e = sufa.begin()+index[i+1];
00251 TsaSorter<TOKEN,sorter_t> foo(sorter,b,e);
00252 tpool->add(foo);
00253
00254 }
00255 }
00256 tpool.reset();
00257 #ifndef NO_MOSES
00258 if (log) *log << "Done sorting after " << util::WallTime() - start_time
00259 << " seconds." << std::endl;
00260 #endif
00261 this->startArray = reinterpret_cast<char const*>(&(*sufa.begin()));
00262 this->endArray = reinterpret_cast<char const*>(&(*sufa.end()));
00263 this->numTokens = sufa.size();
00264 this->indexSize = this->index.size();
00265 #if 1
00266
00267 typename std::vector<cpos>::iterator m = sufa.begin();
00268 for (size_t i = 0; i < wcnt.size(); i++)
00269 {
00270 for (size_t k = 0; k < wcnt[i]; ++k,++m)
00271 {
00272 assert(c->getToken(*m)->id()==i);
00273 assert(m->offset < c->sntLen(m->sid));
00274 }
00275 }
00276 #endif
00277 }
00278
00279
00280
00281 template<typename TOKEN>
00282 char const*
00283 imTSA<TOKEN>::
00284 getLowerBound(id_type id) const
00285 {
00286 if (id >= this->index.size())
00287 return NULL;
00288 assert(index[id] <= this->sufa.size());
00289 return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
00290 }
00291
00292 template<typename TOKEN>
00293 char const*
00294 imTSA<TOKEN>::
00295 getUpperBound(id_type id) const
00296 {
00297 if (++id >= this->index.size())
00298 return NULL;
00299 assert(index[id] <= this->sufa.size());
00300 return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
00301 }
00302
00303 template<typename TOKEN>
00304 char const*
00305 imTSA<TOKEN>::
00306 readSid(char const* p, char const* q, id_type& sid) const
00307 {
00308 assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
00309 assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
00310 sid = reinterpret_cast<cpos const*>(p)->sid;
00311 return p;
00312 }
00313
00314 template<typename TOKEN>
00315 char const*
00316 imTSA<TOKEN>::
00317 readSid(char const* p, char const* q, ::uint64_t& sid) const
00318 {
00319 assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
00320 assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
00321 sid = reinterpret_cast<cpos const*>(p)->sid;
00322 return p;
00323 }
00324
00325 template<typename TOKEN>
00326 char const*
00327 imTSA<TOKEN>::
00328 readOffset(char const* p, char const* q, uint16_t& offset) const
00329 {
00330 assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
00331 assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
00332 offset = reinterpret_cast<cpos const*>(p)->offset;
00333 return p+sizeof(cpos);
00334 }
00335
00336 template<typename TOKEN>
00337 char const*
00338 imTSA<TOKEN>::
00339 readOffset(char const* p, char const* q, ::uint64_t& offset) const
00340 {
00341 assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
00342 assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
00343 offset = reinterpret_cast<cpos const*>(p)->offset;
00344 return p+sizeof(cpos);
00345 }
00346
00347 template<typename TOKEN>
00348 count_type
00349 imTSA<TOKEN>::
00350 rawCnt(char const* p, char const* const q) const
00351 {
00352 cpos const* xp = reinterpret_cast<cpos const*>(p);
00353 cpos const* xq = reinterpret_cast<cpos const*>(q);
00354 return xq-xp;
00355 }
00356
00357 template<typename TOKEN>
00358 void
00359 imTSA<TOKEN>::
00360 getCounts(char const* p, char const* const q,
00361 count_type& sids, count_type& raw) const
00362 {
00363 id_type sid;
00364 bdBitset check(this->corpus->size());
00365 cpos const* xp = reinterpret_cast<cpos const*>(p);
00366 cpos const* xq = reinterpret_cast<cpos const*>(q);
00367 raw = xq-xp;
00368 for (;xp < xq;xp++)
00369 {
00370 sid = xp->sid;
00371
00372 check.set(sid);
00373 }
00374 sids = check.count();
00375 }
00376
00377 template<typename TOKEN>
00378 void
00379 imTSA<TOKEN>::
00380 save_as_mm_tsa(std::string fname) const
00381 {
00382 std::ofstream out(fname.c_str());
00383 filepos_type idxStart(0);
00384 id_type idxSize(index.size());
00385 tpt::numwrite(out,idxStart);
00386 tpt::numwrite(out,idxSize);
00387 std::vector<filepos_type> mmIndex;
00388 for (size_t i = 1; i < this->index.size(); i++)
00389 {
00390 mmIndex.push_back(out.tellp());
00391 for (size_t k = this->index[i-1]; k < this->index[i]; ++k)
00392 {
00393 tpt::tightwrite(out,sufa[k].sid,0);
00394 tpt::tightwrite(out,sufa[k].offset,1);
00395 }
00396 }
00397 mmIndex.push_back(out.tellp());
00398 idxStart = out.tellp();
00399 for (size_t i = 0; i < mmIndex.size(); i++)
00400 tpt::numwrite(out,mmIndex[i]-mmIndex[0]);
00401 out.seekp(0);
00402 tpt::numwrite(out,idxStart);
00403 out.close();
00404 }
00405
00406 template<typename TOKEN>
00407 imTSA<TOKEN>::
00408 imTSA(imTSA<TOKEN> const& prior,
00409 boost::shared_ptr<imTtrack<TOKEN> const> const& crp,
00410 std::vector<id_type> const& newsids, size_t const vsize)
00411 {
00412 typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(crp.get());
00413
00414
00415
00416 size_t newToks = 0;
00417 BOOST_FOREACH(id_type sid, newsids)
00418 newToks += crp->sntLen(sid);
00419 std::vector<cpos> nidx(newToks);
00420
00421 size_t n = 0;
00422 BOOST_FOREACH(id_type sid, newsids)
00423 {
00424 assert(sid < crp->size());
00425 for (size_t o = 0; o < (*crp)[sid].size(); ++o, ++n)
00426 { nidx[n].offset = o; nidx[n].sid = sid; }
00427 }
00428 sort(nidx.begin(),nidx.end(),sorter);
00429
00430
00431 this->numTokens = newToks + prior.sufa.size();
00432 this->sufa.resize(this->numTokens);
00433 this->startArray = reinterpret_cast<char const*>(&(*this->sufa.begin()));
00434 this->endArray = reinterpret_cast<char const*>(&(*this->sufa.end()));
00435 this->corpusSize = crp->size();
00436 this->corpus = crp;
00437 this->index.resize(vsize+1);
00438
00439 size_t i = 0;
00440 typename std::vector<cpos>::iterator k = this->sufa.begin();
00441
00442
00443 for (size_t n = 0; n < nidx.size();)
00444 {
00445 id_type nid = crp->getToken(nidx[n])->id();
00446 assert(nid >= i);
00447 while (i < nid)
00448 {
00449 this->index[i] = k - this->sufa.begin();
00450 if (++i < prior.index.size() && prior.index[i-1] < prior.index[i])
00451 {
00452 k = copy(prior.sufa.begin() + prior.index[i-1],
00453 prior.sufa.begin() + prior.index[i], k);
00454 }
00455 }
00456 this->index[i] = k - this->sufa.begin();
00457 if (++i < prior.index.size() && prior.index[i] > prior.index[i-1])
00458 {
00459 size_t j = prior.index[i-1];
00460 while (j < prior.index[i] && n < nidx.size()
00461 && crp->getToken(nidx[n])->id() < i)
00462 {
00463 assert(k < this->sufa.end());
00464 if (sorter(prior.sufa[j],nidx[n]))
00465 *k++ = prior.sufa[j++];
00466 else
00467 *k++ = nidx[n++];
00468 }
00469 while (j < prior.index[i])
00470 {
00471 assert(k < this->sufa.end());
00472 *k++ = prior.sufa[j++];
00473 }
00474 }
00475 while (n < nidx.size() && this->corpus->getToken(nidx[n])->id() < i)
00476 {
00477 assert(k < this->sufa.end());
00478 *k++ = nidx[n++];
00479 }
00480 this->index[i] = k - this->sufa.begin();
00481 }
00482 this->index[i] = k - this->sufa.begin();
00483 while (++i < this->index.size())
00484 {
00485 if (i < prior.index.size() && prior.index[i-1] < prior.index[i])
00486 k = copy(prior.sufa.begin() + prior.index[i-1],
00487 prior.sufa.begin() + prior.index[i], k);
00488 this->index[i] = k - this->sufa.begin();
00489 }
00490 #if 0
00491
00492 assert(this->sufa.size() == this->index.back());
00493 BOOST_FOREACH(cpos const& x, this->sufa)
00494 {
00495 assert(x.sid < this->corpusSize);
00496 assert(x.offset < this->corpus->sntLen(x.sid));
00497 }
00498 for (size_t i = 1; i < index.size(); ++i)
00499 {
00500 assert(index[i-1] <= index[i]);
00501 assert(index[i] <= sufa.size());
00502 for (size_t k = index[i-1]; k < index[i]; ++k)
00503 assert(this->corpus->getToken(sufa[k])->id() == i-1);
00504 }
00505 assert(index[0] == 0);
00506 assert(this->startArray == reinterpret_cast<char const*>(&(*this->sufa.begin())));
00507 assert(this->endArray == reinterpret_cast<char const*>(&(*this->sufa.end())));
00508 #endif
00509 }
00510
00511 }
00512
00513 #endif