00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef MF_NGRAMTABLE_H
00024 #define MF_NGRAMTABLE_H
00025
00026
00027 #ifndef BACKOFF_
00028 #define BACKOFF_ "_backoff_"
00029 #endif
00030
00031
00032 #ifndef DUMMY_
00033 #define DUMMY_ "_dummy_"
00034 #endif
00035
00036
00037
00038 #ifdef MYCODESIZE
00039 #define DEFCODESIZE MYCODESIZE
00040 #else
00041 #define DEFCODESIZE (int)2
00042 #endif
00043
00044 #define SHORTSIZE (int)2
00045 #define PTRSIZE (int)sizeof(char *)
00046 #define INTSIZE (int)4
00047 #define CHARSIZE (int)1
00048
00049
00050
00051 #define FREQ1 (unsigned char) 1
00052 #define FREQ2 (unsigned char) 2
00053 #define FREQ4 (unsigned char) 4
00054 #define INODE (unsigned char) 8
00055 #define LNODE (unsigned char) 16
00056 #define SNODE (unsigned char) 32
00057 #define FREQ6 (unsigned char) 64
00058 #define FREQ3 (unsigned char) 128
00059
00060 typedef char* node;
00061 typedef char* table;
00062
00063 typedef unsigned char NODETYPE;
00064
00065
00066 typedef enum {FIND,
00067 ENTER,
00068 DELETE,
00069 INIT,
00070 CONT
00071 } ACTION;
00072
00073
00074 typedef enum {COUNT,
00075 LEAFPROB,
00076 FLEAFPROB,
00077 LEAFPROB2,
00078 LEAFPROB3,
00079 LEAFPROB4,
00080 LEAFCODE,
00081 SIMPLE_I,
00082 SIMPLE_B,
00083 SHIFTBETA_I,
00084 SHIFTBETA_B,
00085 MSHIFTBETA_I,
00086 MSHIFTBETA_B,
00087 FULL,
00088
00089 } TABLETYPE;
00090
00091
00092
00093 class tabletype
00094 {
00095
00096 TABLETYPE ttype;
00097
00098 public:
00099
00100 int CODESIZE;
00101 long long code_range[7];
00102
00103
00104 int WORD_OFFS;
00105 int MSUCC_OFFS;
00106 int MTAB_OFFS;
00107 int FLAGS_OFFS;
00108 int SUCC1_OFFS;
00109 int SUCC2_OFFS;
00110 int BOFF_OFFS;
00111 int I_FREQ_OFFS;
00112 int I_FREQ_NUM;
00113 int L_FREQ_NUM;
00114 int L_FREQ_SIZE;
00115
00116
00117 int L_FREQ_OFFS;
00118
00119 TABLETYPE tbtype() {
00120 return ttype;
00121 }
00122
00123 tabletype(TABLETYPE tt,int codesize=DEFCODESIZE) {
00124
00125 if (codesize<=4 && codesize>0)
00126 CODESIZE=codesize;
00127 else {
00128 cerr << "ngramtable wrong codesize\n";
00129 exit(1);
00130 }
00131
00132 code_range[1]=255;
00133 code_range[2]=65535;
00134 code_range[3]=16777214;
00135 code_range[4]=2147483640;
00136 code_range[6]=140737488360000LL;
00137
00138
00139
00140
00141 L_FREQ_SIZE=FREQ1;
00142
00143 WORD_OFFS =0;
00144 MSUCC_OFFS =CODESIZE;
00145 MTAB_OFFS =MSUCC_OFFS+CODESIZE;
00146 FLAGS_OFFS =MTAB_OFFS+PTRSIZE;
00147
00148 switch (tt) {
00149
00150 case COUNT:
00151 SUCC1_OFFS =0;
00152 SUCC2_OFFS =0;
00153 BOFF_OFFS =0;
00154 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE;
00155 I_FREQ_NUM=1;
00156 L_FREQ_NUM=1;
00157
00158 ttype=tt;
00159 break;
00160
00161 case FULL:
00162 case MSHIFTBETA_B:
00163 SUCC1_OFFS =FLAGS_OFFS+CHARSIZE;
00164 SUCC2_OFFS =SUCC1_OFFS+CODESIZE;
00165 BOFF_OFFS =SUCC2_OFFS+CODESIZE;
00166 I_FREQ_OFFS=BOFF_OFFS+INTSIZE;
00167 L_FREQ_OFFS=CODESIZE;
00168 I_FREQ_NUM=2;
00169 L_FREQ_NUM=1;
00170
00171 ttype=tt;
00172 break;
00173
00174 case MSHIFTBETA_I:
00175 SUCC1_OFFS =FLAGS_OFFS+CHARSIZE;
00176 SUCC2_OFFS =SUCC1_OFFS+CODESIZE;
00177 BOFF_OFFS =0;
00178 I_FREQ_OFFS=SUCC2_OFFS+CODESIZE;
00179 L_FREQ_OFFS=CODESIZE;
00180 I_FREQ_NUM=2;
00181 L_FREQ_NUM=1;
00182
00183 ttype=tt;
00184 break;
00185
00186 case SIMPLE_I:
00187 SUCC1_OFFS = 0;
00188 SUCC2_OFFS = 0;
00189 BOFF_OFFS = 0;
00190 I_FREQ_OFFS= FLAGS_OFFS+CHARSIZE;
00191 L_FREQ_OFFS=CODESIZE;
00192 I_FREQ_NUM=1;
00193 L_FREQ_NUM=1;
00194
00195 ttype=tt;
00196 break;
00197
00198 case SIMPLE_B:
00199
00200 SUCC1_OFFS = 0;
00201 SUCC2_OFFS = 0;
00202 BOFF_OFFS = FLAGS_OFFS+CHARSIZE;
00203 I_FREQ_OFFS = BOFF_OFFS+INTSIZE;
00204 L_FREQ_OFFS = CODESIZE;
00205 I_FREQ_NUM = 1;
00206 L_FREQ_NUM = 1;
00207
00208 ttype=tt;
00209 break;
00210
00211 case SHIFTBETA_I:
00212 SUCC1_OFFS = FLAGS_OFFS+CHARSIZE;
00213 SUCC2_OFFS = 0;
00214 BOFF_OFFS = 0;
00215 I_FREQ_OFFS= SUCC1_OFFS+CODESIZE;
00216 L_FREQ_OFFS=CODESIZE;
00217 I_FREQ_NUM=1;
00218 L_FREQ_NUM=1;
00219
00220 ttype=tt;
00221 break;
00222
00223 case SHIFTBETA_B:
00224
00225 SUCC1_OFFS = FLAGS_OFFS+CHARSIZE;
00226 SUCC2_OFFS = 0;
00227 BOFF_OFFS = SUCC1_OFFS+CODESIZE;
00228 I_FREQ_OFFS = BOFF_OFFS+INTSIZE;
00229 L_FREQ_OFFS = CODESIZE;
00230 I_FREQ_NUM = 1;
00231 L_FREQ_NUM = 1;
00232
00233 ttype=tt;
00234 break;
00235
00236 case LEAFPROB:
00237 case FLEAFPROB:
00238 SUCC1_OFFS = 0;
00239 SUCC2_OFFS = 0;
00240 BOFF_OFFS = 0;
00241 I_FREQ_OFFS = FLAGS_OFFS+CHARSIZE;
00242 I_FREQ_NUM = 0;
00243 L_FREQ_NUM = 1;
00244
00245 ttype=tt;
00246 break;
00247
00248 case LEAFPROB2:
00249 SUCC1_OFFS =0;
00250 SUCC2_OFFS =0;
00251 BOFF_OFFS =0;
00252 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE;
00253 I_FREQ_NUM=0;
00254 L_FREQ_NUM=2;
00255
00256 ttype=LEAFPROB;
00257 break;
00258
00259 case LEAFPROB3:
00260 SUCC1_OFFS =0;
00261 SUCC2_OFFS =0;
00262 BOFF_OFFS =0;
00263 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE;
00264 I_FREQ_NUM=0;
00265 L_FREQ_NUM=3;
00266
00267 ttype=LEAFPROB;
00268 break;
00269
00270 case LEAFPROB4:
00271 SUCC1_OFFS =0;
00272 SUCC2_OFFS =0;
00273 BOFF_OFFS =0;
00274 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE;
00275 I_FREQ_NUM=0;
00276 L_FREQ_NUM=4;
00277
00278 ttype=LEAFPROB;
00279 break;
00280
00281 default:
00282 assert(tt==COUNT);
00283 }
00284
00285 L_FREQ_OFFS=CODESIZE;
00286 }
00287
00288 int inodesize(int s) {
00289
00290 return I_FREQ_OFFS + I_FREQ_NUM * s;
00291
00292 }
00293
00294 int lnodesize(int s) {
00295 return L_FREQ_OFFS + L_FREQ_NUM * s;
00296 }
00297
00298 };
00299
00300
00301 class ngramtable:tabletype
00302 {
00303
00304 node tree;
00305 int maxlev;
00306 NODETYPE treeflags;
00307 char info[100];
00308 int resolution;
00309 double decay;
00310
00311 storage* mem;
00312
00313 int* memory;
00314 int* occupancy;
00315 long long* mentr;
00316 long long card;
00317
00318 int idx[MAX_NGRAM+1];
00319
00320 int oov_code,oov_size,du_code, bo_code;
00321
00322 int backoff_state;
00323
00324 public:
00325
00326 int corrcounts;
00327
00328 dictionary *dict;
00329
00330
00331
00332
00333 dictionary *filterdict;
00334
00335 ngramtable(char* filename,int maxl,char* is,
00336 dictionary* extdict,
00337 char* filterdictfile,
00338 int googletable=0,
00339 int dstco=0,char* hmask=NULL,int inplen=0,
00340 TABLETYPE tt=FULL,int codesize=DEFCODESIZE);
00341
00342 inline char* ngtype(char *str=NULL) {
00343 if (str!=NULL) strcpy(info,str);
00344 return info;
00345 }
00346
00347 virtual ~ngramtable();
00348
00349 inline void freetree() {
00350 freetree(tree);
00351 };
00352 void freetree(node nd);
00353
00354 void resetngramtable() {
00355
00356
00357 freetree();
00358 memset(tree,0,inodesize(6));
00359
00360
00361 if (maxlev>1) mtflags(tree,INODE | FREQ4);
00362 else if (maxlev==1) mtflags(tree,LNODE | FREQ4);
00363
00364 word(tree,0);
00365 msucc(tree,0);
00366 mtable(tree,NULL);
00367
00368 for (int i=1; i<=maxlev; i++)
00369 mentr[i]=memory[i]=occupancy[i]=0;
00370
00371 }
00372
00373 void stat(int level=4);
00374
00375 inline long long totfreq(long long v=-1) {
00376 return (v==-1?freq(tree,INODE):freq(tree,INODE,v));
00377 }
00378
00379 inline long long btotfreq(long long v=-1) {
00380 return (v==-1?getfreq(tree,treeflags,1):setfreq(tree,treeflags,v,1));
00381 }
00382
00383 inline long long entries(int lev) {
00384 return mentr[lev];
00385 }
00386
00387 int maxlevel() {
00388 return maxlev;
00389 }
00390
00391
00392 void savetxt(char *filename,int sz=0,int googleformat=0);
00393 void loadtxt(char *filename,int googletable=0);
00394
00395
00396 void savebin(char *filename,int sz=0);
00397 void savebin(mfstream& out);
00398 void savebin(mfstream& out,node nd,NODETYPE ndt,int lev,int mlev);
00399
00400 void loadbin(const char *filename);
00401 void loadbin(mfstream& inp);
00402 void loadbin(mfstream& inp,node nd,NODETYPE ndt,int lev);
00403
00404 void loadbinold(char *filename);
00405 void loadbinold(mfstream& inp,node nd,NODETYPE ndt,int lev);
00406
00407 void generate(char *filename,dictionary *extdict=NULL);
00408 void generate_dstco(char *filename,int dstco);
00409 void generate_hmask(char *filename,char* hmask,int inplen=0);
00410
00411 void augment(ngramtable* ngt);
00412
00413 int scan(ngram& ng,ACTION action=CONT,int maxlev=-1) {
00414 return scan(tree,INODE,0,ng,action,maxlev);
00415 }
00416
00417 int succscan(ngram& h,ngram& ng,ACTION action,int lev) {
00418
00419 return scan(h.link,h.info,lev-1,ng,action,lev);
00420 }
00421
00422 double prob(ngram ng);
00423
00424 int scan(node nd,NODETYPE ndt,int lev,ngram& ng,ACTION action=CONT,int maxl=-1);
00425
00426 void show();
00427
00428 void *search(table *tb,NODETYPE ndt,int lev,int n,int sz,int *w,
00429 ACTION action,char **found=(char **)NULL);
00430
00431 int mybsearch(char *ar, int n, int size, unsigned char *key, int *idx);
00432
00433 int put(ngram& ng);
00434 int put(ngram& ng,node nd,NODETYPE ndt,int lev);
00435
00436 inline int get(ngram& ng) {
00437 return get(ng,maxlev,maxlev);
00438 }
00439 virtual int get(ngram& ng,int n,int lev);
00440
00441 int comptbsize(int n);
00442 table *grow(table *tb,NODETYPE ndt,int lev,int n,int sz,NODETYPE oldndt=0);
00443
00444
00445 bool check_dictsize_bound();
00446
00447
00448 inline int putmem(char* ptr,int value,int offs,int size) {
00449 assert(ptr!=NULL);
00450 for (int i=0; i<size; i++)
00451 ptr[offs+i]=(value >> (8 * i)) & 0xff;
00452 return value;
00453 }
00454
00455 inline int getmem(char* ptr,int* value,int offs,int size) {
00456 assert(ptr!=NULL);
00457 *value=ptr[offs] & 0xff;
00458 for (int i=1; i<size; i++)
00459 *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i));
00460 return *value;
00461 }
00462
00463 inline long putmem(char* ptr,long long value,int offs,int size) {
00464 assert(ptr!=NULL);
00465 for (int i=0; i<size; i++)
00466 ptr[offs+i]=(value >> (8 * i)) & 0xffLL;
00467 return value;
00468 }
00469
00470 inline long getmem(char* ptr,long long* value,int offs,int size) {
00471 assert(ptr!=NULL);
00472 *value=ptr[offs] & 0xff;
00473 for (int i=1; i<size; i++)
00474 *value= *value | ( ( ptr[offs+i] & 0xffLL ) << (8 *i));
00475 return *value;
00476 }
00477
00478 inline void tb2ngcpy(int* wordp,char* tablep,int n=1) {
00479 for (int i=0; i<n; i++)
00480 getmem(tablep,&wordp[i],i*CODESIZE,CODESIZE);
00481 }
00482
00483 inline void ng2tbcpy(char* tablep,int* wordp,int n=1) {
00484 for (int i=0; i<n; i++)
00485 putmem(tablep,wordp[i],i*CODESIZE,CODESIZE);
00486 }
00487
00488 inline int ngtbcmp(int* wordp,char* tablep,int n=1) {
00489 int word;
00490 for (int i=0; i<n; i++) {
00491 getmem(tablep,&word,i*CODESIZE,CODESIZE);
00492 if (wordp[i]!=word) return 1;
00493 }
00494 return 0;
00495 }
00496
00497 inline int word(node nd,int value) {
00498 putmem(nd,value,WORD_OFFS,CODESIZE);
00499 return value;
00500 }
00501
00502 inline int word(node nd) {
00503 int v;
00504 getmem(nd,&v,WORD_OFFS,CODESIZE);
00505 return v;
00506 }
00507
00508 unsigned char mtflags(node nd,unsigned char value) {
00509 return *(unsigned char *)(nd+FLAGS_OFFS)=value;
00510 }
00511
00512 unsigned char mtflags(node nd) {
00513 return *(unsigned char *)(nd+FLAGS_OFFS);
00514 }
00515
00516 int codecmp(char * a,char *b) {
00517 register int i,result;
00518 for (i=(CODESIZE-1); i>=0; i--) {
00519 result=(unsigned char)a[i]-(unsigned char)b[i];
00520 if(result) return result;
00521 }
00522 return 0;
00523 };
00524
00525 int codediff(node a,node b) {
00526 return word(a)-word(b);
00527 };
00528
00529
00530 int update(ngram ng) {
00531
00532 if (!get(ng,ng.size,ng.size)) {
00533 cerr << "cannot find " << ng << "\n";
00534 exit (1);
00535 }
00536
00537 freq(ng.link,ng.pinfo,ng.freq);
00538
00539 return 1;
00540 }
00541
00542 long long freq(node nd,NODETYPE ndt,long long value) {
00543 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00544
00545 if (ndt & FREQ1)
00546 putmem(nd,value,offs,1);
00547 else if (ndt & FREQ2)
00548 putmem(nd,value,offs,2);
00549 else if (ndt & FREQ3)
00550 putmem(nd,value,offs,3);
00551 else if (ndt & FREQ4)
00552 putmem(nd,value,offs,4);
00553 else
00554 putmem(nd,value,offs,6);
00555 return value;
00556 }
00557
00558 long long freq(node nd,NODETYPE ndt) {
00559 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00560 long long value;
00561
00562 if (ndt & FREQ1)
00563 getmem(nd,&value,offs,1);
00564 else if (ndt & FREQ2)
00565 getmem(nd,&value,offs,2);
00566 else if (ndt & FREQ3)
00567 getmem(nd,&value,offs,3);
00568 else if (ndt & FREQ4)
00569 getmem(nd,&value,offs,4);
00570 else
00571 getmem(nd,&value,offs,6);
00572
00573 return value;
00574 }
00575
00576
00577 long long setfreq(node nd,NODETYPE ndt,long long value,int index=0) {
00578 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00579
00580 if (ndt & FREQ1)
00581 putmem(nd,value,offs+index * 1,1);
00582 else if (ndt & FREQ2)
00583 putmem(nd,value,offs+index * 2,2);
00584 else if (ndt & FREQ3)
00585 putmem(nd,value,offs+index * 3,3);
00586 else if (ndt & FREQ4)
00587 putmem(nd,value,offs+index * 4,4);
00588 else
00589 putmem(nd,value,offs+index * 6,6);
00590
00591 return value;
00592 }
00593
00594 long long getfreq(node nd,NODETYPE ndt,int index=0) {
00595 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00596
00597 long long value;
00598
00599 if (ndt & FREQ1)
00600 getmem(nd,&value,offs+ index * 1,1);
00601 else if (ndt & FREQ2)
00602 getmem(nd,&value,offs+ index * 2,2);
00603 else if (ndt & FREQ3)
00604 getmem(nd,&value,offs+ index * 3,3);
00605 else if (ndt & FREQ4)
00606 getmem(nd,&value,offs+ index * 4,4);
00607 else
00608 getmem(nd,&value,offs+ index * 6,6);
00609
00610 return value;
00611 }
00612
00613
00614 double boff(node nd) {
00615 int value=0;
00616 getmem(nd,&value,BOFF_OFFS,INTSIZE);
00617
00618 return double (value/(double)1000000000.0);
00619 }
00620
00621
00622 double myround(double x) {
00623 long int i=(long int)(x);
00624 return (x-i)>0.500?i+1.0:(double)i;
00625 }
00626
00627 int boff(node nd,double value) {
00628 int v=(int)myround(value * 1000000000.0);
00629 putmem(nd,v,BOFF_OFFS,INTSIZE);
00630
00631 return 1;
00632 }
00633
00634 int succ2(node nd,int value) {
00635 putmem(nd,value,SUCC2_OFFS,CODESIZE);
00636 return value;
00637 }
00638
00639 int succ2(node nd) {
00640 int value=0;
00641 getmem(nd,&value,SUCC2_OFFS,CODESIZE);
00642 return value;
00643 }
00644
00645 int succ1(node nd,int value) {
00646 putmem(nd,value,SUCC1_OFFS,CODESIZE);
00647 return value;
00648 }
00649
00650 int succ1(node nd) {
00651 int value=0;
00652 getmem(nd,&value,SUCC1_OFFS,CODESIZE);
00653 return value;
00654 }
00655
00656 int msucc(node nd,int value) {
00657 putmem(nd,value,MSUCC_OFFS,CODESIZE);
00658 return value;
00659 }
00660
00661 int msucc(node nd) {
00662 int value;
00663 getmem(nd,&value,MSUCC_OFFS,CODESIZE);
00664 return value;
00665 }
00666
00667 table mtable(node nd) {
00668 char v[PTRSIZE];;
00669 for (int i=0; i<PTRSIZE; i++)
00670 v[i]=nd[MTAB_OFFS+i];
00671
00672 return *(table *)v;
00673 }
00674
00675 table mtable(node nd,table value) {
00676 char *v=(char *)&value;
00677 for (int i=0; i<PTRSIZE; i++)
00678 nd[MTAB_OFFS+i]=v[i];
00679 return value;
00680 }
00681
00682 int mtablesz(node nd) {
00683 if (mtflags(nd) & LNODE) {
00684 if (mtflags(nd) & FREQ1)
00685 return lnodesize(1);
00686 else if (mtflags(nd) & FREQ2)
00687 return lnodesize(2);
00688 else if (mtflags(nd) & FREQ3)
00689 return lnodesize(3);
00690 else if (mtflags(nd) & FREQ4)
00691 return lnodesize(4);
00692 else
00693 return lnodesize(6);
00694 } else if (mtflags(nd) & INODE) {
00695 if (mtflags(nd) & FREQ1)
00696 return inodesize(1);
00697 else if (mtflags(nd) & FREQ2)
00698 return inodesize(2);
00699 else if (mtflags(nd) & FREQ3)
00700 return inodesize(3);
00701 else if (mtflags(nd) & FREQ4)
00702 return inodesize(4);
00703 else
00704 return inodesize(6);
00705 } else {
00706 cerr << "node has wrong flags\n";
00707 exit(1);
00708 }
00709 }
00710
00711
00712 int bo_state(int value=-1) {
00713 return (value==-1?backoff_state:backoff_state=value);
00714 }
00715
00716
00717 };
00718
00719 #endif
00720
00721
00722
00723