00001
00002
00003
00004 #ifndef _ug_tsa_base_h
00005 #define _ug_tsa_base_h
00006
00007 #include <iostream>
00008 #include <string>
00009
00010 #include <boost/iostreams/device/mapped_file.hpp>
00011 #include <boost/shared_ptr.hpp>
00012 #include "tpt_tokenindex.h"
00013 #include "ug_ttrack_base.h"
00014 #include "ug_im_ttrack.h"
00015 #include "ug_corpus_token.h"
00016 #include "ug_tsa_tree_iterator.h"
00017 #include "ug_tsa_array_entry.h"
00018 #include "ug_tsa_bitset_cache.h"
00019 #include "ug_typedefs.h"
00020
00021 namespace sapt
00022 {
00023
00024 namespace bio=boost::iostreams;
00025
00026 template<typename TKN>
00027 TKN const*
00028 next(TKN const* x)
00029 {
00030 return static_cast<TKN const*>(x ? x->next() : NULL);
00031 }
00032
00043 template<typename TKN>
00044 class TSA
00045 {
00046 public:
00047 virtual ~TSA() {};
00048 typedef TSA_tree_iterator<TKN> tree_iterator;
00049
00050 typedef tsa::ArrayEntry ArrayEntry;
00051
00052
00053
00054 typedef boost::shared_ptr<bitvector> bitset_pointer;
00055 typedef TKN Token;
00056 typedef BitSetCache<TSA<TKN> > BSC_t;
00057
00058
00059
00060 friend class TSA_tree_iterator<TKN>;
00061
00062 protected:
00063 boost::shared_ptr<Ttrack<TKN> const> corpus;
00064 char const* startArray;
00065 char const* endArray;
00066
00067
00068 size_t corpusSize;
00078 id_type numTokens;
00087 id_type indexSize;
00088
00089
00090 size_t BitSetCachingThreshold;
00091
00093
00094
00098 virtual
00099 char const*
00100 index_jump(char const* startRange,
00101 char const* stopRange,
00102 float fraction) const = 0;
00103
00107 char const*
00108 find_start(char const* lo, char const* const upX,
00109 TKN const* const refStart, int refLen,
00110 size_t d) const;
00111
00115 char const*
00116 find_end(char const* lo, char const* const upX,
00117 TKN const* const refStart, int refLen,
00118 size_t d) const;
00119
00123 char const*
00124 find_longer(char const* lo, char const* const upX,
00125 TKN const* const refStart, int refLen,
00126 size_t d) const;
00127
00131 virtual
00132 char const*
00133 getLowerBound(id_type id) const = 0;
00134
00135 virtual
00136 char const*
00137 getUpperBound(id_type id) const = 0;
00138
00139 public:
00140 boost::shared_ptr<BSC_t> bsc;
00141
00142 char const* arrayStart() const { return startArray; }
00143 char const* arrayEnd() const { return endArray; }
00144
00148 char const*
00149 lower_bound(typename std::vector<TKN>::const_iterator const& keyStart,
00150 typename std::vector<TKN>::const_iterator const& keyStop) const;
00151 char const*
00152 lower_bound(TKN const* keyStart, TKN const* keyStop) const;
00153
00154 char const*
00155 lower_bound(TKN const* keyStart, int keyLen) const;
00156
00160 char const*
00161 upper_bound(typename std::vector<TKN>::const_iterator const& keyStart,
00162 typename std::vector<TKN>::const_iterator const& keyStop) const;
00163
00164 char const*
00165 upper_bound(TKN const* keyStart, int keyLength) const;
00166
00167
00169 void dump(std::ostream& out, TokenIndex const& T) const;
00170
00175 count_type
00176 fillBitSet(std::vector<TKN> const& phrase, bdBitset& dest) const;
00177
00178 count_type
00179 fillBitSet(TKN const* key, size_t keyLen, bdBitset& dest) const;
00180
00181 count_type
00182 setBits(char const* startRange, char const* endRange,
00183 boost::dynamic_bitset<uint64_t>& bs) const;
00184
00185 void
00186 setTokenBits(char const* startRange, char const* endRange, size_t len,
00187 bitvector& bs) const;
00188
00196 virtual
00197 char const*
00198 readSid(char const* p, char const* q, id_type& sid) const = 0;
00199
00200 virtual
00201 char const*
00202 readSid(char const* p, char const* q, ::uint64_t& sid) const = 0;
00203
00211 virtual
00212 char const*
00213 readOffset(char const* p, char const* q, uint16_t& offset) const = 0;
00214
00215 virtual
00216 char const*
00217 readOffset(char const* p, char const* q, ::uint64_t& offset) const = 0;
00218
00221 count_type
00222 sntCnt(char const* p, char const* const q) const;
00223
00224 count_type
00225 rawCnt2(TKN const* keyStart, size_t keyLen) const;
00226
00232 virtual
00233 count_type
00234 rawCnt(char const* p, char const* const q) const = 0;
00235
00242 virtual
00243 void
00244 getCounts(char const* p, char const* const q,
00245 count_type& sids, count_type& raw) const = 0;
00246
00247 std::string
00248 suffixAt(char const* p, TokenIndex const* V=NULL, size_t maxlen=0)
00249 const;
00250
00251 std::string
00252 suffixAt(ArrayEntry const& I, TokenIndex const* V=NULL, size_t maxlen=0)
00253 const;
00254
00255 tsa::ArrayEntry& readEntry(char const* p, tsa::ArrayEntry& I) const;
00256
00258 char const* dataEnd() const;
00259
00260 bool sanityCheck1() const;
00261
00269 ::uint64_t
00270 getSequenceId(typename std::vector<TKN>::const_iterator const& pstart,
00271 typename std::vector<TKN>::const_iterator const& pstop) const;
00272
00273 ::uint64_t
00274 getSequenceId(TKN const* t, ushort plen) const;
00275
00277 std::string
00278 getSequence(::uint64_t pid, TokenIndex const& V) const;
00279
00281 std::vector<TKN>
00282 getSequence(::uint64_t pid) const;
00283
00284 TKN const*
00285 getSequenceStart(::uint64_t) const;
00286
00287 ushort
00288 getSequenceLength(::uint64_t) const;
00289
00290 size_t
00291 getCorpusSize() const;
00292
00293 Ttrack<TKN> const*
00294 getCorpus() const;
00295
00296 bitset_pointer
00297 getBitSet(TKN const* startKey, size_t keyLen) const;
00298
00299 boost::shared_ptr<bitvector>
00300 findTree(TKN const* treeStart, TKN const* treeEnd,
00301 bitvector const* filter) const;
00302
00303 size_t markOccurrences(char const* lo, char const* up, size_t len,
00304 bitvector& bitset,
00305 bool markOnlyStartPosition) const;
00306
00307 bool
00308 findBranches(TKN const* base, bitvector const& terminals,
00309 std::vector<tree_iterator>& dest) const;
00310
00311 double aveIndexEntrySize() const
00312 {
00313 return (endArray-startArray)/double(numTokens);
00314 }
00315
00316 public:
00317
00318 SPTR<TSA_tree_iterator<TKN> >
00319 find(TKN const* start, size_t len) const
00320 {
00321 typedef TSA_tree_iterator<TKN> iter;
00322 SPTR<iter> ret(new iter(this));
00323 size_t i = 0;
00324 while (i < len && ret->extend(start[i])) ++i;
00325 if (i < len) ret.reset();
00326 return ret;
00327 }
00328
00329 };
00330
00331
00332
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347
00348
00349
00354 template<typename TKN>
00355 count_type
00356 TSA<TKN>::
00357 fillBitSet(std::vector<TKN> const& key,
00358 bitvector& bitset) const
00359 {
00360 if (!key.size()) return 0;
00361 return fillBitset(&(key[0]),key.size(),bitset);
00362 }
00363
00364
00365
00370 template<typename TKN>
00371 count_type
00372 TSA<TKN>::
00373 fillBitSet(TKN const* key, size_t keyLen,
00374 bitvector& bitset) const
00375 {
00376 char const* lo = lower_bound(key,keyLen);
00377 char const* up = upper_bound(key,keyLen);
00378 bitset.resize(corpus->size());
00379 bitset.reset();
00380 return setBits(lo,up,bitset);
00381 }
00382
00383
00384
00385 template<typename TKN>
00386 count_type
00387 TSA<TKN>::
00388 setBits(char const* startRange, char const* endRange,
00389 bitvector& bs) const
00390 {
00391 count_type wcount=0;
00392 char const* p = startRange;
00393 id_type sid;
00394 ushort off;
00395 while (p < endRange)
00396 {
00397 p = readSid(p,endRange,sid);
00398 p = readOffset(p,endRange,off);
00399 bs.set(sid);
00400 wcount++;
00401 }
00402 return wcount;
00403 }
00404
00405
00406
00407 template<typename TKN>
00408 void
00409 TSA<TKN>::
00410 setTokenBits(char const* startRange, char const* endRange, size_t len,
00411 bitvector& bs) const
00412 {
00413 ArrayEntry I;
00414 I.next = startRange;
00415 do {
00416 readEntry(I.next,I);
00417 Token const* t = corpus->getToken(I);
00418 Token const* stop = t->stop(*corpus,I.sid);
00419 for (size_t i = 1; i < len; ++i)
00420 {
00421 assert(t != stop);
00422 t = t->next();
00423 }
00424 assert(t != stop);
00425 bs.set(t - corpus->sntStart(0));
00426 } while (I.next != endRange);
00427 }
00428
00429
00430
00431 template<typename TKN>
00432 count_type
00433 TSA<TKN>::
00434 sntCnt(char const* p, char const* const q) const
00435 {
00436 id_type sid; uint16_t off;
00437 bitvector check(corpus->size());
00438 while (p < q)
00439 {
00440 p = readSid(p,q,sid);
00441 p = readOffset(p,q,off);
00442 check.set(sid);
00443 }
00444 return check.count();
00445 }
00446
00447
00448
00452 template<typename TKN>
00453 char const*
00454 TSA<TKN>::
00455 find_start(char const* lo, char const* const upX,
00456 TKN const* const refStart, int refLen,
00457 size_t d) const
00458 {
00459 char const* up = upX;
00460 if (lo >= up) return NULL;
00461 int x;
00462 ArrayEntry I;
00463 while (lo < up)
00464 {
00465 readEntry(index_jump(lo,up,.5),I);
00466 x = corpus->cmp(I,refStart,refLen,d);
00467 if (x >= 0) up = I.pos;
00468 else lo = I.next;
00469 }
00470 assert(lo==up);
00471 if (lo < upX)
00472 {
00473 readEntry(lo,I);
00474 x = corpus->cmp(I,refStart,refLen,d);
00475 }
00476
00477 return (x == 0 || x == 1) ? lo : NULL;
00478 }
00479
00480
00481
00485 template<typename TKN>
00486 char const*
00487 TSA<TKN>::
00488 find_end(char const* lo, char const* const upX,
00489 TKN const* const refStart, int refLen,
00490 size_t d) const
00491
00492 {
00493 char const* up = upX;
00494 if (lo >= up) return NULL;
00495 int x;
00496 ArrayEntry I;
00497
00498 while (lo < up)
00499 {
00500 readEntry(index_jump(lo,up,.1),I);
00501 x = corpus->cmp(I,refStart,refLen,d);
00502 if (x == 2) up = I.pos;
00503 else lo = I.next;
00504
00505 }
00506 assert(lo==up);
00507 if (lo < upX)
00508 {
00509 readEntry(lo,I);
00510 x = corpus->cmp(I,refStart,refLen,d);
00511 }
00512 return (x == 2) ? up : upX;
00513 }
00514
00515
00516
00520 template<typename TKN>
00521 char const*
00522 TSA<TKN>::
00523 find_longer(char const* lo, char const* const upX,
00524 TKN const* const refStart, int refLen,
00525 size_t d) const
00526 {
00527 char const* up = upX;
00528 if (lo >= up) return NULL;
00529 int x;
00530 ArrayEntry I;
00531 while (lo < up)
00532 {
00533 readEntry(index_jump(lo,up,.5),I);
00534 x = corpus->cmp(I,refStart,refLen,d);
00535 if (x > 0) up = I.pos;
00536 else lo = I.next;
00537 }
00538 assert(lo==up);
00539 if (lo < upX)
00540 {
00541 readEntry(index_jump(lo,up,.5),I);
00542 x = corpus->cmp(I,refStart,refLen,d);
00543 }
00544 return (x == 1) ? up : NULL;
00545 }
00546
00547
00548
00553 template<typename TKN>
00554 char const*
00555 TSA<TKN>::
00556 lower_bound(typename std::vector<TKN>::const_iterator const& keyStart,
00557 typename std::vector<TKN>::const_iterator const& keyStop) const
00558 {
00559 TKN const* const a = &(*keyStart);
00560 TKN const* const z = &(*keyStop);
00561 return lower_bound(a,z);
00562 }
00563
00564
00565
00570 template<typename TKN>
00571 char const*
00572 TSA<TKN>::
00573 lower_bound(TKN const* const keyStart,
00574 TKN const* const keyStop) const
00575 {
00576 return lower_bound(keyStart,keyStop-keyStart);
00577 }
00578
00579 template<typename TKN>
00580 char const*
00581 TSA<TKN>::
00582 lower_bound(TKN const* const keyStart, int keyLen) const
00583 {
00584 if (keyLen == 0) return startArray;
00585 char const* const lower = getLowerBound(keyStart->id());
00586 char const* const upper = getUpperBound(keyStart->id());
00587 return find_start(lower,upper,keyStart,keyLen,0);
00588 }
00589
00590
00595 template<typename TKN>
00596 char const*
00597 TSA<TKN>::
00598 upper_bound(typename std::vector<TKN>::const_iterator const& keyStart,
00599 typename std::vector<TKN>::const_iterator const& keyStop) const
00600 {
00601 TKN const* const a = &((TKN)*keyStart);
00602 TKN const* const z = &((TKN)*keyStop);
00603 return upper_bound(a,z-a);
00604 }
00605
00606
00607
00612 template<typename TKN>
00613 char const*
00614 TSA<TKN>::
00615 upper_bound(TKN const* keyStart, int keyLength) const
00616 {
00617 if (keyLength == 0) return arrayEnd();
00618 char const* const lower = getLowerBound(keyStart->id());
00619 char const* const upper = getUpperBound(keyStart->id());
00620 return find_end(lower,upper,keyStart,keyLength,0);
00621 }
00622
00623
00624
00625 template<typename TKN>
00626 count_type
00627 TSA<TKN>::
00628 rawCnt2(TKN const* keyStart, size_t keyLen) const
00629 {
00630 char const* lo = lower_bound(keyStart,keyLen);
00631 char const* up = upper_bound(keyStart,keyLen);
00632
00633 return rawCnt(lo,up);
00634 }
00635
00636
00637
00638 template<typename TKN>
00639 ::uint64_t
00640 TSA<TKN>::
00641 getSequenceId(typename std::vector<TKN>::const_iterator const& pstart,
00642 typename std::vector<TKN>::const_iterator const& pstop) const
00643 {
00644 return getSequenceId(&(*pstart),pstop-pstart);
00645 }
00646
00647
00648
00649 template<typename TKN>
00650 ::uint64_t
00651 TSA<TKN>::
00652 getSequenceId(TKN const* pstart, ushort plen) const
00653 {
00654 char const* p = lower_bound(pstart,plen);
00655 if (!p) return 0;
00656 ArrayEntry I;
00657 readEntry(p,I);
00658 ::uint64_t ret = I.sid;
00659 ret <<= 16;
00660 ret += I.offset;
00661 ret <<= 16;
00662 ret += plen;
00663 return ret;
00664 }
00665
00666
00667
00668 template<typename TKN>
00669 std::vector<TKN>
00670 TSA<TKN>::
00671 getSequence(::uint64_t pid) const
00672 {
00673 size_t plen = pid % 65536;
00674 size_t offset = (pid >> 16) % 65536;
00675 TKN const* w = corpus->sntStart(pid >> 32)+offset;
00676 std::vector<TKN> ret(plen);
00677 for (size_t i = 0; i < plen; i++, w = w->next())
00678 {
00679 assert(w);
00680 ret[i] = *w;
00681 }
00682 return ret;
00683 }
00684
00685 template<typename TKN>
00686 std::string
00687 TSA<TKN>::
00688 getSequence(::uint64_t pid, TokenIndex const& V) const
00689 {
00690 std::ostringstream buf;
00691 TKN const* a = getSequenceStart(pid);
00692 buf << V[a->id()];
00693 size_t len = getSequenceLength(pid);
00694 for (a = a->next(); --len>0; a = a->next())
00695 buf << " " << V[a->id()];
00696 return buf.str();
00697 }
00698
00699
00700
00701
00702 template<typename TKN>
00703 TKN const*
00704 TSA<TKN>::
00705 getSequenceStart(::uint64_t pid) const
00706 {
00707 size_t offset = (pid >> 16) % 65536;
00708 return corpus->sntStart(pid >> 32)+offset;
00709 }
00710
00711
00712
00713 template<typename TKN>
00714 ushort
00715 TSA<TKN>::
00716 getSequenceLength(::uint64_t pid) const
00717 {
00718 return (pid % 65536);
00719 }
00720
00721
00722
00723 template<typename TKN>
00724 size_t
00725 TSA<TKN>::
00726 getCorpusSize() const
00727 {
00728 return corpusSize;
00729 }
00730
00731
00732
00733 template<typename TKN>
00734 Ttrack<TKN> const*
00735 TSA<TKN>::
00736 getCorpus() const
00737 {
00738 return corpus.get();
00739 }
00740
00741
00742
00743 template<typename TKN>
00744 tsa::ArrayEntry &
00745 TSA<TKN>::
00746 readEntry(char const* p, tsa::ArrayEntry& I) const
00747 {
00748 I.pos = p;
00749 p = readSid(p,endArray,I.sid);
00750 I.next = readOffset(p,endArray,I.offset);
00751 assert(I.sid < corpus->size());
00752 assert(I.offset < corpus->sntLen(I.sid));
00753 return I;
00754 };
00755
00756
00757
00759 template<typename TKN>
00760 typename TSA<TKN>::bitset_pointer
00761 TSA<TKN>::
00762 getBitSet(TKN const* startKey, size_t keyLen) const
00763 {
00764 bitset_pointer ret;
00765 if (bsc != NULL)
00766 ret = bsc->get(startKey,keyLen);
00767 else
00768 {
00769 ret.reset(new bitvector(corpus->size()));
00770 fillBitSet(startKey,keyLen,*ret);
00771 }
00772 return ret;
00773 }
00774
00775
00776
00777 template<typename TKN>
00778 size_t
00779 TSA<TKN>::
00780 markOccurrences(char const* lo, char const* up, size_t len,
00781 bitvector& bitset, bool markOnlyStartPosition) const
00782 {
00783 id_type sid;
00784 ushort off;
00785 count_type wcount=0;
00786 TKN const* crpStart = corpus->sntStart(0);
00787 char const* p = lo;
00788 while (p < up)
00789 {
00790 p = readSid(p,up,sid);
00791 p = readOffset(p,up,off);
00792 TKN const* t = corpus->sntStart(sid)+off;
00793 if (markOnlyStartPosition)
00794 bitset.set(t-crpStart);
00795 else
00796 for (size_t i = 0; i < len; ++i, t = t->next())
00797 bitset.set(t-crpStart);
00798 wcount++;
00799 }
00800 return wcount;
00801 }
00802 #if 1
00803 template<typename TKN>
00804 bool
00805 TSA<TKN>::
00806 findBranches(TKN const* base, bitvector const& terminals,
00807 std::vector<tree_iterator>& dest) const
00808 {
00809 dest.assign(terminals.count(),tree_iterator(this));
00810 for (size_t i = terminals.find_first(), k = 0;
00811 i < terminals.size();
00812 i = terminals.find_next(i),++k)
00813 {
00814 for (TKN const* x = base+i; x && x->id(); x = x->next())
00815 if (!dest[k].extend(x->id()))
00816 return false;
00817 }
00818 typename tree_iterator::SortByApproximateCount sorter;
00819 sort(dest.begin(),dest.end(),sorter);
00820 return true;
00821 }
00822 #endif
00823
00824 }
00825 #endif