00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef __ug_ttrack_base
00012 #define __ug_ttrack_base
00013
00014 #include <string>
00015 #include <vector>
00016
00017 #include <boost/dynamic_bitset.hpp>
00018
00019 #include "ug_ttrack_position.h"
00020 #include "tpt_typedefs.h"
00021 #include "tpt_tokenindex.h"
00022 #include "moses/Util.h"
00023
00024 namespace sapt
00025 {
00026
00027 typedef boost::dynamic_bitset<uint64_t> bdBitset;
00028 using tpt::count_type;
00029
00030 size_t len_from_pid(uint64_t pid);
00031
00032 template<typename sid_t, typename off_t, typename len_t>
00033 void
00034 parse_pid(uint64_t const pid, sid_t & sid,
00035 off_t & off, len_t& len)
00036 {
00037 static uint64_t two32 = uint64_t(1)<<32;
00038 static uint64_t two16 = uint64_t(1)<<16;
00039 len = pid%two16;
00040 off = (pid%two32)>>16;
00041 sid = pid>>32;
00042 }
00043
00044 template<typename Token>
00045 std::string
00046 toString(TokenIndex const& V, Token const* x, size_t const len)
00047 {
00048 if (!len) return "";
00049 UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
00050 std::ostringstream buf;
00051 buf << V[x->id()];
00052 size_t i = 1;
00053 for (x = x->next(); x && i < len; ++i, x = x->next())
00054 buf << " " << V[x->id()];
00055 UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
00056 return buf.str();
00057 }
00058
00059 template<typename TKN=id_type>
00060 class Ttrack
00061 {
00062 public:
00063
00064 virtual ~Ttrack() {};
00065 typedef typename ttrack::Position Position;
00066 typedef TKN Token;
00067
00069 virtual
00070 TKN const*
00071 sntStart(size_t sid) const = 0;
00072
00074 virtual
00075 TKN const*
00076 sntEnd(size_t sid) const = 0;
00077
00078 TKN const*
00079 getToken(Position const& p) const;
00080
00081 template<typename T>
00082 T const*
00083 getTokenAs(Position const& p) const
00084 { return reinterpret_cast<T const*>(getToken(p)); }
00085
00086 template<typename T>
00087 T const*
00088 sntStartAs(id_type sid) const
00089 { return reinterpret_cast<T const*>(sntStart(sid)); }
00090
00091 template<typename T>
00092 T const*
00093 sntEndAs(id_type sid) const
00094 { return reinterpret_cast<T const*>(sntEnd(sid)); }
00095
00097 size_t sntLen(size_t sid) const { return sntEnd(sid) - sntStart(sid); }
00098
00099 size_t
00100 startPos(id_type sid) const { return sntStart(sid)-sntStart(0); }
00101
00102 size_t
00103 endPos(id_type sid) const { return sntEnd(sid)-sntStart(0); }
00104
00106 std::vector<TKN>
00107 operator[](id_type sid) const
00108 {
00109 return std::vector<TKN>(sntStart(sid),sntEnd(sid));
00110 }
00111
00113 virtual size_t size() const = 0;
00114
00116 virtual size_t numTokens() const = 0;
00117
00120 std::string str(id_type sid, TokenIndex const& T) const;
00121
00122 std::string pid2str(TokenIndex const* V, uint64_t pid) const;
00123
00124
00125
00126
00127
00130 count_type count_tokens(std::vector<count_type>& cnt, bdBitset const* filter,
00131 int lengthCutoff=0, std::ostream* log=NULL) const;
00132
00133
00134
00135 int cmp(Position const& A, Position const& B, int keyLength) const;
00136 int cmp(Position const& A, TKN const* keyStart, int keyLength=-1,
00137 int depth=0) const;
00138
00139 virtual id_type findSid(TKN const* t) const = 0;
00140
00141
00142
00143
00144 TKN const*
00145 find_next_within_sentence(TKN const* startKey,
00146 int keyLength,
00147 Position startHere) const;
00148
00149 Position
00150 find_first(TKN const* startKey, int keyLength,
00151 bdBitset const* filter=NULL) const;
00152
00153 Position
00154 find_next(TKN const* startKey, int keyLength, Position startAfter,
00155 bdBitset const* filter=NULL) const;
00156
00157
00158 virtual size_t offset(TKN const* t) const { return t-sntStart(0); }
00159 };
00160
00161
00162
00163 template<typename TKN>
00164 TKN const*
00165 Ttrack<TKN>::
00166 getToken(Position const& p) const
00167 {
00168 TKN const* ret = sntStart(p.sid)+p.offset;
00169 return (ret < sntEnd(p.sid)) ? ret : NULL;
00170 }
00171
00172
00173
00174 template<typename TKN>
00175 count_type
00176 Ttrack<TKN>::
00177 count_tokens(std::vector<count_type>& cnt, bdBitset const* filter,
00178 int lengthCutoff, std::ostream* log) const
00179 {
00180 bdBitset filter2;
00181 if (!filter)
00182 {
00183 filter2.resize(this->size());
00184 filter2.set();
00185 filter = &filter2;
00186 }
00187 cnt.clear();
00188 cnt.reserve(500000);
00189 count_type totalCount=0;
00190
00191 int64_t expectedTotal=0;
00192 for (size_t sid = 0; sid < this->size(); ++sid)
00193 expectedTotal += this->sntLen(sid);
00194
00195 for (size_t sid = filter->find_first();
00196 sid < filter->size();
00197 sid = filter->find_next(sid))
00198 {
00199 TKN const* k = sntStart(sid);
00200 TKN const* const stop = sntEnd(sid);
00201 if (lengthCutoff && stop-k >= lengthCutoff)
00202 {
00203 if (log)
00204 *log << "WARNING: skipping sentence #" << sid
00205 << " with more than 65536 tokens" << std::endl;
00206 expectedTotal -= stop-k;
00207 }
00208 else
00209 {
00210 totalCount += stop-k;
00211 for (; k < stop; ++k)
00212 {
00213
00214 id_type wid = k->id();
00215 while (wid >= cnt.size()) cnt.push_back(0);
00216 cnt[wid]++;
00217 }
00218 }
00219 }
00220 if (this->size() == filter->count())
00221 {
00222 if (totalCount != expectedTotal)
00223 std::cerr << "OOPS: expected " << expectedTotal
00224 << " tokens but counted " << totalCount << std::endl;
00225 assert(totalCount == expectedTotal);
00226 }
00227 return totalCount;
00228 }
00229
00230 template<typename TKN>
00231 int
00232 Ttrack<TKN>::
00233 cmp(Position const& A, Position const& B, int keyLength) const
00234 {
00235 if (keyLength==0) return 2;
00236 assert(A.sid < this->size());
00237 assert(B.sid < this->size());
00238
00239 TKN const* a = getToken(A);
00240 TKN const* bosA = sntStart(A.sid);
00241 TKN const* eosA = sntEnd(A.sid);
00242
00243 TKN const* b = getToken(B);
00244 TKN const* bosB = sntStart(B.sid);
00245 TKN const* eosB = sntEnd(B.sid);
00246
00247 int ret=-1;
00248
00249 #if 0
00250 cerr << "A: "; for (TKN const* x = a; x; x = next(x)) cerr << x->lemma << " "; cerr << std::endl;
00251 cerr << "B: "; for (TKN const* x = b; x; x = next(x)) cerr << x->lemma << " "; cerr << std::endl;
00252 #endif
00253
00254 while (a >= bosA && a < eosA)
00255 {
00256
00257 if (*a < *b) { break; }
00258 if (*a > *b) { ret = 2; break; }
00259 a = next(a);
00260 b = next(b);
00261
00262 if (--keyLength==0 || b < bosB || b >= eosB)
00263 {
00264 ret = (a < bosA || a >= eosA) ? 0 : 1;
00265 break;
00266 }
00267 }
00268
00269 return ret;
00270 }
00271
00272 template<typename TKN>
00273 int
00274 Ttrack<TKN>::
00275 cmp(Position const& A, TKN const* key, int keyLength, int depth) const
00276 {
00277 if (keyLength==0 || !key) return 2;
00278 assert(A.sid < this->size());
00279 TKN const* x = getToken(A);
00280 TKN const* stopx = x->stop(*this,A.sid);
00281 for (int i = 0; i < depth; ++i)
00282 {
00283 x = x->next();
00284 if (x == stopx) return -1;
00285
00286 }
00287 while (x != stopx)
00288 {
00289 if (*x < *key) return -1;
00290 if (*x > *key) return 2;
00291 key = key->next();
00292 x = x->next();
00293 if (--keyLength==0)
00294 return (x == stopx) ? 0 : 1;
00295 assert(key);
00296 }
00297 return -1;
00298 }
00299
00300 template<typename TKN>
00301 TKN const*
00302 Ttrack<TKN>::
00303 find_next_within_sentence(TKN const* startKey, int keyLength,
00304 Position startHere) const
00305 {
00306 for (TKN const* t = getToken(startHere); t; t = getToken(startHere))
00307 {
00308 #if 0
00309 int foo = cmp(startHere,startKey,1);
00310 if (foo == 0 || foo ==1)
00311 {
00312 TKN const* k = startKey->next();
00313 TKN const* t2 = t->next();
00314 if (t2)
00315 {
00316 cout << t2->lemma << "." << int(t2->minpos) << " "
00317 << k->lemma << "." << int(k->minpos) << " "
00318 << t2->cmp(*k) << std::endl;
00319 }
00320 }
00321 #endif
00322 int x = cmp(startHere,startKey,keyLength,0);
00323 if (x == 0 || x == 1) return t;
00324 startHere.offset++;
00325 }
00326 return NULL;
00327 }
00328
00329 template<typename TKN>
00330 typename Ttrack<TKN>::Position
00331 Ttrack<TKN>::
00332 find_first(TKN const* startKey, int keyLength, bdBitset const* filter) const
00333 {
00334 if (filter)
00335 {
00336 for (size_t sid = filter->find_first();
00337 sid < filter->size();
00338 sid = filter->find_next(sid))
00339 {
00340 TKN const* x = find_next_within_sentence(startKey,keyLength,Position(sid,0));
00341 if (x) return Position(sid,x-sntStart(sid));
00342 }
00343 }
00344 else
00345 {
00346 for (size_t sid = 0; sid < this->size(); ++sid)
00347 {
00348 TKN const* x = find_next_within_sentence(startKey,keyLength,Position(sid,0));
00349 if (x) return Position(sid,x-sntStart(sid));
00350 }
00351 }
00352 return Position(this->size(),0);
00353 }
00354
00355 template<typename TKN>
00356 typename Ttrack<TKN>::Position
00357 Ttrack<TKN>::
00358 find_next(TKN const* startKey, int keyLength, Position startAfter, bdBitset const* filter) const
00359 {
00360 id_type sid = startAfter.sid;
00361 startAfter.offset++;
00362 if (filter) assert(filter->test(sid));
00363 TKN const* x = find_next_within_sentence(startKey,keyLength,startAfter);
00364 if (x) return Position(sid,x -sntStart(sid));
00365 if (filter)
00366 {
00367 for (sid = filter->find_next(sid); sid < filter->size(); sid = filter->find_next(sid))
00368 {
00369 x = find_next_within_sentence(startKey,keyLength,Position(sid,0));
00370 if (x) break;
00371 }
00372 }
00373 else
00374 {
00375 for (++sid; sid < this->size(); sid++)
00376 {
00377 x = find_next_within_sentence(startKey,keyLength,Position(sid,0));
00378 if (x) break;
00379 }
00380 }
00381 if (x)
00382 return Position(sid,x-sntStart(sid));
00383 else
00384 return Position(this->size(),0);
00385 }
00386
00387 template<typename TKN>
00388 std::string
00389 Ttrack<TKN>::
00390 pid2str(TokenIndex const* V, uint64_t pid) const
00391 {
00392 uint32_t len = pid % (1<<16);
00393 pid >>= 16;
00394 uint32_t off = pid % (1<<16);
00395 uint32_t sid = pid>>16;
00396 std::ostringstream buf;
00397 TKN const* t = sntStart(sid) + off;
00398 TKN const* stop = t + len;
00399 if (V)
00400 {
00401 while (t < stop)
00402 {
00403 buf << (*V)[t->id()];
00404 if ((t = t->next()) != stop) buf << " ";
00405 }
00406 }
00407 else
00408 {
00409 while (t < stop)
00410 {
00411 buf << t->id();
00412 if ((t = t->next()) != stop) buf << " ";
00413 }
00414 }
00415 return buf.str();
00416 }
00417
00418 }
00419 #endif