00001
00002
00003
00004
00005
00006
00007 #ifndef moses_PrefixTree_h
00008 #define moses_PrefixTree_h
00009
00010 #include <vector>
00011 #include <algorithm>
00012 #include <deque>
00013 #include "Util.h"
00014 #include "FilePtr.h"
00015 #include "File.h"
00016
00017 namespace Moses
00018 {
00019
00022 template<typename T,typename D>
00023 class PrefixTreeSA
00024 {
00025 public:
00026 typedef T Key;
00027 typedef D Data;
00028
00029 typedef PrefixTreeSA<T,D> Self;
00030 typedef std::vector<T> VT;
00031 typedef std::vector<Self*> VP;
00032 typedef std::vector<D> VD;
00033
00034 VT keys;
00035 VP ptr;
00036 VD data;
00037
00038 static Data def;
00039
00040 public:
00041 PrefixTreeSA() {}
00042
00043 ~PrefixTreeSA() {
00044 for(size_t i=0; i<ptr.size(); ++i) delete ptr[i];
00045 }
00046
00047 static const Data& getDefault() {
00048 return def;
00049 }
00050 static void setDefault(const Data& x) {
00051 def=x;
00052 }
00053
00054
00055
00056 template<typename fwiter> Data& insert(fwiter b,fwiter e) {
00057 typename VT::iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
00058 typename VT::iterator kb=keys.begin();
00059 size_t pos=std::distance(kb,i);
00060
00061 if(i==keys.end() || *i!=*b) {
00062 keys.insert(i,*b);
00063 data.insert(data.begin()+pos,def);
00064
00065 Self *self = NULL;
00066 ptr.insert(ptr.begin()+pos, self);
00067 }
00068 if(++b!=e) {
00069 if(!ptr[pos]) ptr[pos]=new Self;
00070 return ptr[pos]->insert(b,e);
00071 } else return data[pos];
00072 }
00073
00074 template<typename cont> Data& insert(const cont& c) {
00075 return insert(c.begin(),c.end());
00076 }
00077
00078 size_t size() const {
00079 return keys.size();
00080 }
00081 const Key& getKey(size_t i) const {
00082 return keys[i];
00083 }
00084 const Data& getData(size_t i) const {
00085 return data[i];
00086 }
00087 const Self* getPtr(size_t i) const {
00088 return ptr[i];
00089 }
00090
00091 size_t findKey(const Key& k) const {
00092 typename VT::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
00093 if(i==keys.end() || *i!=k) return keys.size();
00094 return std::distance(keys.begin(),i);
00095 }
00096
00097
00098 template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
00099 size_t pos=findKey(*b);
00100 if(pos==keys.size()) return 0;
00101 if(++b==e) return &data[pos];
00102 if(ptr[pos]) return ptr[pos]->findPtr(b,e);
00103 else return 0;
00104 }
00105
00106 template<typename cont> const Data* findPtr(const cont& c) const {
00107 return findPtr(c.begin(),c.end());
00108 }
00109
00110
00111
00112 template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
00113 if(const Data* p=findPtr(b,e)) return *p;
00114 else return def;
00115 }
00116
00117
00118 template<typename cont> const Data& find(const cont& c) const {
00119 return find(c.begin(),c.end());
00120 }
00121
00122 void shrink() {
00123 ShrinkToFit(keys);
00124 ShrinkToFit(ptr);
00125 ShrinkToFit(data);
00126 }
00127
00128 };
00129 template<typename T,typename D> D PrefixTreeSA<T,D>::def;
00130
00132
00135 template<typename T,typename D>
00136 class PrefixTreeF
00137 {
00138 public:
00139 typedef T Key;
00140 typedef D Data;
00141 private:
00142 typedef PrefixTreeF<Key,Data> Self;
00143 public:
00144 typedef FilePtr<Self> Ptr;
00145 private:
00146 typedef std::vector<Key> VK;
00147 typedef std::vector<Data> VD;
00148 typedef std::vector<Ptr> VP;
00149
00150 VK keys;
00151 VD data;
00152 VP ptr;
00153
00154 static Data def;
00155
00156 OFF_T startPos;
00157 FILE* f;
00158 public:
00159
00160 PrefixTreeF(FILE* f_=0) : f(f_) {
00161 if(f) read();
00162 }
00163
00164 ~PrefixTreeF() {
00165 free();
00166 }
00167
00168 void read() {
00169 startPos=fTell(f);
00170 fReadVector(f,keys);
00171 fReadVector(f,data);
00172 ptr.clear();
00173 ptr.resize(keys.size());
00174 std::vector<OFF_T> rawOffs(keys.size());
00175 size_t bytes_read = fread(&rawOffs[0], sizeof(OFF_T), keys.size(), f);
00176 UTIL_THROW_IF2(bytes_read != keys.size(), "Read error at " << HERE);
00177 for(size_t i=0; i<ptr.size(); ++i)
00178 if (rawOffs[i]) ptr[i].set(f, rawOffs[i]);
00179 }
00180
00181 void free() {
00182 for(typename VP::iterator i=ptr.begin(); i!=ptr.end(); ++i) i->free();
00183 }
00184
00185 void reserve(size_t s) {
00186 keys.reserve(s);
00187 data.reserve(s);
00188 ptr.reserve(s);
00189 }
00190
00191 template<typename fwiter>
00192 void changeData(fwiter b,fwiter e,const Data& d) {
00193 typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
00194 if(i==keys.end() || *i!=*b) {
00195 TRACE_ERR("ERROR: key not found in changeData!\n");
00196 return;
00197 }
00198 typename VK::const_iterator kb=keys.begin();
00199 size_t pos=std::distance(kb,i);
00200 if(++b==e) {
00201 OFF_T p=startPos+keys.size()*sizeof(Key)+2*sizeof(unsigned)+pos*sizeof(Data);
00202 TRACE_ERR("elem found at pos "<<p<<" old val: "<<data[pos]<<" startpos: "<<startPos<<"\n");
00203 if(data[pos]!=d) {
00204 data[pos]=d;
00205 fSeek(f,p);
00206 fWrite(f,d);
00207 }
00208 return;
00209 }
00210 if(ptr[pos]) ptr[pos]->changeData(b,e,d);
00211 else {
00212 TRACE_ERR("ERROR: seg not found!in changeData\n");
00213 }
00214 }
00215
00216
00217 void create(const PrefixTreeSA<Key,Data>& psa,const std::string& fname) {
00218 FILE* f=fOpen(fname.c_str(),"wb");
00219 create(psa,f);
00220 fclose(f);
00221 }
00222
00223 void create(const PrefixTreeSA<Key,Data>& psa,FILE* f,int verbose=0) {
00224 setDefault(psa.getDefault());
00225
00226 typedef std::pair<const PrefixTreeSA<Key,Data>*,OFF_T> P;
00227 typedef std::deque<P> Queue;
00228
00229 Queue queue;
00230
00231 queue.push_back(P(&psa,fTell(f)));
00232 bool isFirst=1;
00233 size_t ns=1;
00234 while(queue.size()) {
00235 if(verbose && queue.size()>ns) {
00236 TRACE_ERR("stack size in PF create: "<<queue.size()<<"\n");
00237 while(ns<queue.size()) ns*=2;
00238 }
00239 const P& pp=queue.back();
00240 const PrefixTreeSA<Key,Data>& p=*pp.first;
00241 OFF_T pos=pp.second;
00242 queue.pop_back();
00243
00244 if(!isFirst) {
00245 OFF_T curr=fTell(f);
00246 fSeek(f,pos);
00247 fWrite(f,curr);
00248 fSeek(f,curr);
00249 } else isFirst=0;
00250
00251 size_t s=0;
00252 s+=fWriteVector(f,p.keys);
00253 s+=fWriteVector(f,p.data);
00254
00255 for(size_t i=0; i<p.ptr.size(); ++i) {
00256 if(p.ptr[i])
00257 queue.push_back(P(p.ptr[i],fTell(f)));
00258 OFF_T ppos=0;
00259 s+=fWrite(f,ppos);
00260 }
00261 }
00262 }
00263
00264 size_t size() const {
00265 return keys.size();
00266 }
00267 const Key& getKey(size_t i) const {
00268 return keys[i];
00269 }
00270 const Data& getData(size_t i) const {
00271 return data[i];
00272 }
00273 const Self* getPtr(size_t i) const {
00274 return ptr[i];
00275 }
00276
00277 size_t findKey(const Key& k) const {
00278 typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
00279 if(i==keys.end() || *i!=k) return keys.size();
00280 return std::distance(keys.begin(),i);
00281 }
00282
00283 Ptr const* findKeyPtr(const Key& k) const {
00284 size_t pos=findKey(k);
00285 return (pos<keys.size() ? &ptr[pos] : 0);
00286 }
00287
00288
00289 template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
00290 typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
00291 if(i==keys.end() || *i!=*b) return 0;
00292 size_t pos=std::distance(keys.begin(),i);
00293 if(++b==e) return &data[pos];
00294 if(ptr[pos]) return ptr[pos]->findPtr(b,e);
00295 else return 0;
00296 }
00297
00298 template<typename cont> const Data* findPtr(const cont& c) const {
00299 return findPtr(c.begin(),c.end());
00300 }
00301
00302
00303
00304 template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
00305 if(const Data* p=findPtr(b,e)) return *p;
00306 else return def;
00307 }
00308
00309
00310 template<typename cont> const Data& find(const cont& c) const {
00311 return find(c.begin(),c.end());
00312 }
00313
00314 static void setDefault(const Data& d) {
00315 def=d;
00316 }
00317 static const Data& getDefault() {
00318 return def;
00319 }
00320
00321
00322 void print(std::ostream& out,const std::string s="") const {
00323
00324 out<<s<<"startpos: "<<startPos<<" size: "<<keys.size()<<"\n";
00325 for(size_t i=0; i<keys.size(); ++i) {
00326 out<<s<<i<<" - "<<keys[i]<<" "<<data[i]<<"\n";
00327 }
00328 for(size_t i=0; i<ptr.size(); ++i)
00329 if(ptr[i])
00330 ptr[i]->print(out,s+" ");
00331 }
00332
00333
00334 };
00335 template<typename T,typename D> D PrefixTreeF<T,D>::def;
00336
00337 }
00338
00339 #endif