00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef MF_LMTABLE_H
00025 #define MF_LMTABLE_H
00026
00027 #ifndef WIN32
00028 #include <sys/types.h>
00029 #include <sys/mman.h>
00030 #endif
00031
00032 #include <math.h>
00033 #include <cstdlib>
00034 #include <string>
00035 #include <set>
00036 #include <limits>
00037 #include "util.h"
00038 #include "ngramcache.h"
00039 #include "dictionary.h"
00040 #include "n_gram.h"
00041 #include "lmContainer.h"
00042
00043 #define MAX(a,b) (((a)>(b))?(a):(b))
00044 #define MIN(a,b) (((a)<(b))?(a):(b))
00045
00046 #define LMTMAXLEV 20
00047 #define MAX_LINE 100000
00048
00049 #ifndef LMTCODESIZE
00050 #define LMTCODESIZE (int)3
00051 #endif
00052
00053 #define SHORTSIZE (int)2
00054 #define PTRSIZE (int)sizeof(char *)
00055 #define INTSIZE (int)4
00056 #define CHARSIZE (int)1
00057
00058 #define PROBSIZE (int)4 //use float
00059 #define QPROBSIZE (int)1 //use qfloat_t
00060
00061 #define BOUNDSIZE (int)sizeof(table_entry_pos_t) //use table_pos_t
00062
00063 #define UNIGRAM_RESOLUTION 10000000.0
00064
00065 typedef enum {INTERNAL,QINTERNAL,LEAF,QLEAF} LMT_TYPE;
00066
00067 typedef char* node;
00068
00069 typedef enum {LMT_FIND,
00070 LMT_ENTER,
00071 LMT_INIT,
00072 LMT_CONT
00073 } LMT_ACTION;
00074
00075 typedef unsigned int table_entry_pos_t;
00076 typedef unsigned long table_pos_t;
00077 typedef unsigned char qfloat_t;
00078
00079
00080
00081 #define BOUND_EMPTY1 (numeric_limits<table_entry_pos_t>::max() - 2)
00082 #define BOUND_EMPTY2 (numeric_limits<table_entry_pos_t>::max() - 1)
00083
00084 class lmtable: public lmContainer
00085 {
00086 static const bool debug=true;
00087
00088 void loadtxt(std::istream& inp,const char* header,const char* filename,int mmap);
00089 void loadtxt_ram(std::istream& inp,const char* header);
00090 void loadtxt_mmap(std::istream& inp,const char* header,const char* outfilename);
00091 void loadtxt_level(std::istream& inp,int l);
00092
00093 void loadbin(std::istream& inp,const char* header,const char* filename,int mmap);
00094 void loadbin_header(std::istream& inp, const char* header);
00095 void loadbin_dict(std::istream& inp);
00096 void loadbin_codebook(std::istream& inp,int l);
00097 void loadbin_level(std::istream& inp,int l);
00098
00099 protected:
00100 char* table[LMTMAXLEV+1];
00101 LMT_TYPE tbltype[LMTMAXLEV+1];
00102 table_entry_pos_t cursize[LMTMAXLEV+1];
00103
00104
00105
00106
00107 table_entry_pos_t tb_offset[LMTMAXLEV+1];
00108
00109 table_entry_pos_t maxsize[LMTMAXLEV+1];
00110 table_entry_pos_t* startpos[LMTMAXLEV+1];
00111 char info[100];
00112
00113
00114 int totget[LMTMAXLEV+1];
00115 int totbsearch[LMTMAXLEV+1];
00116
00117
00118 bool isQtable;
00119
00120
00121 bool isItable;
00122
00123
00124 bool isInverted;
00125
00126
00127 bool isPruned;
00128
00129 int NumCenters[LMTMAXLEV+1];
00130 float* Pcenters[LMTMAXLEV+1];
00131 float* Bcenters[LMTMAXLEV+1];
00132
00133 double logOOVpenalty;
00134 int dictionary_upperbound;
00135 int backoff_state;
00136
00137
00138 int max_cache_lev;
00139
00140 NGRAMCACHE_t* prob_and_state_cache;
00141 NGRAMCACHE_t* lmtcache[LMTMAXLEV+1];
00142 float ngramcache_load_factor;
00143 float dictionary_load_factor;
00144
00145
00146 int memmap;
00147 int diskid;
00148 off_t tableOffs[LMTMAXLEV+1];
00149 off_t tableGaps[LMTMAXLEV+1];
00150
00151
00152
00153 bool orderQuery;
00154
00155
00156 bool delete_dict;
00157
00158 public:
00159
00160 #ifdef TRACE_CACHELM
00161 std::fstream* cacheout;
00162 int sentence_id;
00163 #endif
00164
00165 dictionary *dict;
00166
00167 lmtable(float nlf=0.0, float dlfi=0.0);
00168
00169 virtual ~lmtable();
00170
00171 table_entry_pos_t wdprune(float *thr, int aflag=0);
00172 table_entry_pos_t wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double lk=0, double bo=0, double *ts=0, double *tbs=0);
00173 double lprobx(ngram ong, double *lkp=0, double *bop=0, int *bol=0);
00174
00175 table_entry_pos_t ngcnt(table_entry_pos_t *cnt);
00176 table_entry_pos_t ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos);
00177 int pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s);
00178
00179 void init_prob_and_state_cache();
00180 void init_probcache() {
00181 init_prob_and_state_cache();
00182 };
00183 void init_statecache() {};
00184 void init_lmtcaches(int uptolev);
00185 void init_caches(int uptolev);
00186
00187 void used_prob_and_state_cache();
00188 void used_lmtcaches();
00189 void used_caches();
00190
00191
00192 void delete_prob_and_state_cache();
00193 void delete_probcache() {
00194 delete_prob_and_state_cache();
00195 };
00196 void delete_statecache() {};
00197 void delete_lmtcaches();
00198 void delete_caches();
00199
00200 void check_prob_and_state_cache_levels();
00201 void check_probcache_levels() {
00202 check_prob_and_state_cache_levels();
00203 };
00204 void check_statecache_levels() {};
00205 void check_lmtcaches_levels();
00206 void check_caches_levels();
00207
00208 void reset_prob_and_state_cache();
00209 void reset_probcache() {
00210 reset_prob_and_state_cache();
00211 };
00212 void reset_statecache() {};
00213 void reset_lmtcaches();
00214 void reset_caches();
00215
00216
00217 bool are_prob_and_state_cache_active();
00218 bool is_probcache_active() {
00219 return are_prob_and_state_cache_active();
00220 };
00221 bool is_statecache_active() {
00222 return are_prob_and_state_cache_active();
00223 };
00224 bool are_lmtcaches_active();
00225 bool are_caches_active();
00226
00227 void reset_mmap();
00228
00229
00230
00231
00232 bool is_inverted(const bool flag) {
00233 return isInverted=flag;
00234 }
00235 bool is_inverted() {
00236 return isInverted;
00237 }
00238
00239 void configure(int n,bool quantized);
00240
00241
00242 double getlogOOVpenalty() const {
00243 return logOOVpenalty;
00244 }
00245
00246 double setlogOOVpenalty(int dub) {
00247 assert(dub > dict->size());
00248 dictionary_upperbound = dub;
00249 return logOOVpenalty=log((double)(dictionary_upperbound - dict->size()))/M_LN10;
00250 }
00251
00252 double setlogOOVpenalty(double oovp) {
00253 return logOOVpenalty=oovp;
00254 }
00255
00256 virtual int maxlevel() const {
00257 return maxlev;
00258 };
00259 bool isQuantized() const {
00260 return isQtable;
00261 }
00262
00263
00264 void savetxt(const char *filename);
00265 void savebin(const char *filename);
00266
00267 void appendbin_level(int level, fstream &out, int mmap);
00268 void appendbin_level_nommap(int level, fstream &out);
00269 void appendbin_level_mmap(int level, fstream &out);
00270
00271 void savebin_level(int level, const char* filename, int mmap);
00272 void savebin_level_nommap(int level, const char* filename);
00273 void savebin_level_mmap(int level, const char* filename);
00274 void savebin_dict(std::fstream& out);
00275
00276 void compact_all_levels(const char* filename);
00277 void compact_single_level(int level, const char* filename);
00278
00279 void concatenate_all_levels(const char* fromfilename, const char* tofilename);
00280 void concatenate_single_level(int level, const char* fromfilename, const char* tofilename);
00281
00282 void remove_all_levels(const char* filename);
00283 void remove_single_level(int level, const char* filename);
00284
00285 void print_table_stat();
00286 void print_table_stat(int level);
00287
00288 void dumplm(std::fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos);
00289
00290
00291 void delete_level(int level, const char* outfilename, int mmap);
00292 void delete_level_nommap(int level);
00293 void delete_level_mmap(int level, const char* filename);
00294
00295 void resize_level(int level, const char* outfilename, int mmap);
00296 void resize_level_nommap(int level);
00297 void resize_level_mmap(int level, const char* filename);
00298
00299 inline void update_offset(int level, table_entry_pos_t value) { tb_offset[level]=value; };
00300
00301
00302 void load(const std::string filename, int mmap=0);
00303 void load(std::istream& inp,const char* filename=NULL,const char* outfilename=NULL,int mmap=0,OUTFILE_TYPE outtype=NONE);
00304
00305 void load_centers(std::istream& inp,int l);
00306
00307 void expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap);
00308 void expand_level_nommap(int level, table_entry_pos_t size);
00309 void expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename);
00310
00311 void cpsublm(lmtable* sublmt, dictionary* subdict,bool keepunigr=true);
00312
00313 int reload(std::set<string> words);
00314
00315 void filter(const char* ) {};
00316
00317
00318 virtual double lprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
00319 virtual double clprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
00320 virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
00321
00322
00323 void *search(int lev,table_entry_pos_t offs,table_entry_pos_t n,int sz,int *w, LMT_ACTION action,char **found=(char **)NULL);
00324
00325 int mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx);
00326
00327
00328 int add(ngram& ng, float prob,float bow);
00329
00330
00331 int addwithoffset(ngram& ng, float prob,float bow);
00332
00333
00334 void checkbounds(int level);
00335
00336 inline int get(ngram& ng) {
00337 return get(ng,ng.size,ng.size);
00338 }
00339 int get(ngram& ng,int n,int lev);
00340
00341 int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev);
00342
00343 virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
00344 virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
00345
00346 inline void putmem(char* ptr,int value,int offs,int size) {
00347 assert(ptr!=NULL);
00348 for (int i=0; i<size; i++)
00349 ptr[offs+i]=(value >> (8 * i)) & 0xff;
00350 };
00351
00352 inline void getmem(char* ptr,int* value,int offs,int size) {
00353 assert(ptr!=NULL);
00354 *value=ptr[offs] & 0xff;
00355 for (int i=1; i<size; i++){
00356 *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i));
00357 }
00358 };
00359
00360 template<typename T>
00361 inline void putmem(char* ptr,T value,int offs) {
00362 assert(ptr!=NULL);
00363 memcpy(ptr+offs, &value, sizeof(T));
00364 };
00365
00366 template<typename T>
00367 inline void getmem(char* ptr,T* value,int offs) {
00368 assert(ptr!=NULL);
00369 memcpy((void*)value, ptr+offs, sizeof(T));
00370 };
00371
00372
00373 int nodesize(LMT_TYPE ndt) {
00374 switch (ndt) {
00375 case INTERNAL:
00376 return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE;
00377 case QINTERNAL:
00378 return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE;
00379 case LEAF:
00380 return LMTCODESIZE + PROBSIZE;
00381 case QLEAF:
00382 return LMTCODESIZE + QPROBSIZE;
00383 default:
00384 assert(0);
00385 return 0;
00386 }
00387 }
00388
00389 inline int word(node nd,int value=-1) {
00390 int offset=0;
00391
00392 if (value==-1)
00393 getmem(nd,&value,offset,LMTCODESIZE);
00394 else
00395 putmem(nd,value,offset,LMTCODESIZE);
00396
00397 return value;
00398 };
00399
00400
00401 int codecmp(node a,node b) {
00402 register int i,result;
00403 for (i=(LMTCODESIZE-1); i>=0; i--) {
00404 result=(unsigned char)a[i]-(unsigned char)b[i];
00405 if(result) return result;
00406 }
00407 return 0;
00408 };
00409
00410 int codediff(node a,node b) {
00411 return word(a)-word(b);
00412 };
00413
00414
00415 inline float prob(node nd,LMT_TYPE ndt) {
00416 int offs=LMTCODESIZE;
00417
00418 float fv;
00419 unsigned char cv;
00420 switch (ndt) {
00421 case INTERNAL:
00422 getmem(nd,&fv,offs);
00423 return fv;
00424 case QINTERNAL:
00425 getmem(nd,&cv,offs);
00426 return (float) cv;
00427 case LEAF:
00428 getmem(nd,&fv,offs);
00429 return fv;
00430 case QLEAF:
00431 getmem(nd,&cv,offs);
00432 return (float) cv;
00433 default:
00434 assert(0);
00435 return 0;
00436 }
00437 };
00438
00439 template<typename T>
00440 inline T prob(node nd, LMT_TYPE ndt, T value) {
00441 int offs=LMTCODESIZE;
00442
00443 switch (ndt) {
00444 case INTERNAL:
00445 putmem(nd, value,offs);
00446 break;
00447 case QINTERNAL:
00448 putmem(nd,(unsigned char) value,offs);
00449 break;
00450 case LEAF:
00451 putmem(nd, value,offs);
00452 break;
00453 case QLEAF:
00454 putmem(nd,(unsigned char) value,offs);
00455 break;
00456 default:
00457 assert(0);
00458 return (T) 0;
00459 }
00460
00461 return value;
00462 };
00463
00464
00465 inline float bow(node nd,LMT_TYPE ndt) {
00466 int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
00467
00468 float fv;
00469 unsigned char cv;
00470 switch (ndt) {
00471 case INTERNAL:
00472 getmem(nd,&fv,offs);
00473 return fv;
00474 case QINTERNAL:
00475 getmem(nd,&cv,offs);
00476 return (float) cv;
00477 case LEAF:
00478 getmem(nd,&fv,offs);
00479 return fv;
00480 case QLEAF:
00481 getmem(nd,&cv,offs);
00482 return (float) cv;
00483 default:
00484 assert(0);
00485 return 0;
00486 }
00487 };
00488
00489 template<typename T>
00490 inline T bow(node nd,LMT_TYPE ndt, T value) {
00491 int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
00492
00493 switch (ndt) {
00494 case INTERNAL:
00495 putmem(nd, value,offs);
00496 break;
00497 case QINTERNAL:
00498 putmem(nd,(unsigned char) value,offs);
00499 break;
00500 case LEAF:
00501 putmem(nd, value,offs);
00502 break;
00503 case QLEAF:
00504 putmem(nd,(unsigned char) value,offs);
00505 break;
00506 default:
00507 assert(0);
00508 return 0;
00509 }
00510
00511 return value;
00512 };
00513
00514
00515 inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level){ return bound(nd,ndt) - tb_offset[level+1]; }
00516
00517 inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level){ return bound(nd, ndt, value + tb_offset[level+1]); }
00518
00519
00520 table_entry_pos_t bound(node nd,LMT_TYPE ndt) {
00521
00522 int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
00523
00524 table_entry_pos_t value;
00525
00526 getmem(nd,&value,offs);
00527
00528
00529
00530 return value;
00531 };
00532
00533
00534
00535 table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value) {
00536
00537 int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
00538
00539
00540
00541 putmem(nd,value,offs);
00542
00543 return value;
00544 };
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574
00575
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595 int succrange(node ndp,int level,table_entry_pos_t* isucc=NULL,table_entry_pos_t* esucc=NULL);
00596
00597 void stat(int lev=0);
00598 void printTable(int level);
00599
00600 virtual inline void setDict(dictionary* d) {
00601 if (delete_dict==true && dict) delete dict;
00602 dict=d;
00603 delete_dict=false;
00604 };
00605
00606 virtual inline dictionary* getDict() const {
00607 return dict;
00608 };
00609
00610 inline table_entry_pos_t getCurrentSize(int l) const {
00611 return cursize[l];
00612 };
00613
00614 inline void setOrderQuery(bool v) {
00615 orderQuery = v;
00616 }
00617 inline bool isOrderQuery() const {
00618 return orderQuery;
00619 }
00620
00621 inline float GetNgramcacheLoadFactor() {
00622 return ngramcache_load_factor;
00623 }
00624 inline float GetDictioanryLoadFactor() {
00625 return ngramcache_load_factor;
00626 }
00627
00628
00629 inline virtual void dictionary_incflag(const bool flag) {
00630 UNUSED(flag);
00631 };
00632
00633 inline virtual bool filter(const string sfilter, lmtable* sublmt, const string skeepunigrams) {
00634 std::cerr << "filtering... \n";
00635 dictionary *dict=new dictionary((char *)sfilter.c_str());
00636
00637 cpsublm(sublmt, dict,(skeepunigrams=="yes"));
00638 delete dict;
00639 std::cerr << "...done\n";
00640 return true;
00641 }
00642
00643 inline virtual bool is_OOV(int code) {
00644 return (code == dict->oovcode());
00645 };
00646
00647 };
00648
00649 #endif
00650