00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 using namespace std;
00024
00025 #include "mfstream.h"
00026 #include "math.h"
00027 #include "mempool.h"
00028 #include "htable.h"
00029 #include "dictionary.h"
00030 #include "n_gram.h"
00031 #include "ngramtable.h"
00032
00033 ngramtable::ngramtable(char* filename,int maxl,char* ,
00034 dictionary* extdict ,char* filterdictfile,
00035 int googletable,int dstco,char* hmask, int inplen,TABLETYPE ttype,
00036 int codesize): tabletype(ttype,codesize)
00037 {
00038
00039 cerr << "[codesize " << CODESIZE << "]\n";
00040 char header[100];
00041
00042 info[0]='\0';
00043
00044 corrcounts=0;
00045
00046 if (filename) {
00047 int n;
00048 mfstream inp(filename,ios::in );
00049
00050 inp >> header;
00051
00052 if (strncmp(header,"nGrAm",5)==0 || strncmp(header,"NgRaM",5)==0) {
00053 inp >> n;
00054 inp >> card;
00055 inp >> info;
00056 if (strcmp(info,"LM_")==0) {
00057 inp >> resolution;
00058 inp >> decay;
00059 sprintf(info,"%s %d %f",info,resolution,decay);
00060 } else {
00061 resolution=10000000;
00062 decay=0.9999;
00063 }
00064
00065 maxl=n;
00066
00067 cerr << n << " " << card << " " << info << "\n";
00068 }
00069
00070 inp.close();
00071 }
00072
00073 if (!maxl) {
00074 cerr << "ngramtable: ngram size must be specified\n";
00075 exit(1);
00076 }
00077
00078
00079 if (dstco && (maxl!=2) && (maxl!=3)) {
00080 cerr << "distant co-occurrences work with 2-gram and 3-gram!\n";
00081 exit(1);
00082 }
00083
00084 maxlev=maxl;
00085
00086
00087
00088 treeflags=INODE | FREQ6;
00089 tree=(node) new char[inodesize(6)];
00090 memset(tree,0,inodesize(6));
00091
00092
00093
00094 if (maxlev>1)
00095 mtflags(tree,INODE | FREQ4);
00096 else if (maxlev==1)
00097 mtflags(tree,LNODE | FREQ4);
00098 else {
00099 cerr << "ngramtable: wrong level setting\n";
00100 exit(1);
00101 }
00102
00103 word(tree,0);
00104
00105 if (I_FREQ_NUM)
00106 freq(tree,treeflags,0);
00107
00108 msucc(tree,0);
00109 mtable(tree,NULL);
00110
00111 mem=new storage(256,10000);
00112
00113 mentr=new long long[maxlev+1];
00114 memory= new int[maxlev+1];
00115 occupancy= new int[maxlev+1];
00116
00117
00118 mentr[0]=1;
00119 memory[0]=inodesize(6);
00120 occupancy[0]=inodesize(6);
00121
00122 for (int i=1; i<=maxlev; i++)
00123 mentr[i]=memory[i]=occupancy[i]=0;
00124
00125 dict=new dictionary(NULL,1000000);
00126
00127 if (!filename) return ;
00128
00129 filterdict=NULL;
00130 if (filterdictfile) {
00131 filterdict=new dictionary(filterdictfile,1000000);
00132
00133
00134
00135
00136
00137
00138 }
00139
00140
00141
00142 if ((strncmp(header,"ngram",5)==0) ||
00143 (strncmp(header,"NGRAM",5)==0)) {
00144 cerr << "this ngram file format is no more supported!\n";
00145 exit(1);
00146 }
00147
00148 if (strncmp(header,"nGrAm",5)==0)
00149 loadtxt(filename);
00150 else if (strncmp(header,"NgRaM",5)==0)
00151 loadbin(filename);
00152 else if (dstco>0)
00153 generate_dstco(filename,dstco);
00154 else if (hmask != NULL)
00155 generate_hmask(filename,hmask,inplen);
00156 else if (googletable)
00157 loadtxt(filename,googletable);
00158 else
00159 generate(filename,extdict);
00160
00161
00162
00163 if (tbtype()==LEAFPROB) {
00164 du_code=dict->encode(DUMMY_);
00165 bo_code=dict->encode(BACKOFF_);
00166 }
00167 }
00168
00169 void ngramtable::savetxt(char *filename,int depth,int googleformat)
00170 {
00171
00172 if (depth>maxlev) {
00173 cerr << "savetxt: wrong n-gram size\n";
00174 exit(1);
00175 }
00176
00177 depth=(depth>0?depth:maxlev);
00178
00179 card=mentr[depth];
00180
00181 ngram ng(dict);
00182
00183 if (googleformat)
00184 cerr << "savetxt in Google format: nGrAm " << depth << " " << card << " " << info << "\n";
00185 else
00186 cerr << "savetxt: nGrAm " << depth << " " << card << " " << info << "\n";
00187
00188 mfstream out(filename,ios::out );
00189
00190 if (!googleformat)
00191 out << "nGrAm " << depth << " " << card << " " << info << "\n";
00192
00193 if (!googleformat)
00194 dict->save(out);
00195
00196 scan(ng,INIT,depth);
00197
00198 while(scan(ng,CONT,depth)) out << ng <<"\n";
00199
00200 cerr << "\n";
00201
00202 out.close();
00203 }
00204
00205
00206 void ngramtable::loadtxt(char *filename,int googletable)
00207 {
00208
00209 ngram ng(dict);;
00210
00211 cerr << "loadtxt:" << (googletable?"google format":"std table");
00212
00213 mfstream inp(filename,ios::in);
00214
00215 int i,c=0;
00216
00217 if (googletable) {
00218 dict->incflag(1);
00219 } else {
00220 char header[100];
00221 inp.getline(header,100);
00222 cerr << header ;
00223 dict->load(inp);
00224 }
00225
00226 while (!inp.eof()) {
00227
00228 for (i=0; i<maxlev; i++) inp >> ng;
00229
00230 inp >> ng.freq;
00231
00232 if (ng.size==0) continue;
00233
00234
00235 if (googletable) dict->incfreq(*ng.wordp(1),ng.freq);
00236
00237
00238
00239
00240
00241 if (filterdict) {
00242 int code=filterdict->encode(dict->decode(*ng.wordp(maxlev)));
00243 if (code!=filterdict->oovcode()) put(ng);
00244 } else put(ng);
00245
00246 ng.size=0;
00247
00248 if (!(++c % 1000000)) cerr << ".";
00249
00250 }
00251
00252 if (googletable) {
00253 dict->incflag(0);
00254 }
00255
00256 cerr << "\n";
00257
00258 inp.close();
00259 }
00260
00261
00262
00263 void ngramtable::savebin(mfstream& out,node nd,NODETYPE ndt,int lev,int mlev)
00264 {
00265
00266 out.write(nd+WORD_OFFS,CODESIZE);
00267
00268
00269
00270 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00271
00272 int frnum=1;
00273 if (tbtype()==LEAFPROB && (ndt & LNODE))
00274 frnum=L_FREQ_NUM;
00275
00276 if ((ndt & LNODE) || I_FREQ_NUM) {
00277 if (ndt & FREQ1)
00278 out.write(nd+offs,1 * frnum);
00279 else if (ndt & FREQ2)
00280 out.write(nd+offs,2 * frnum);
00281 else if (ndt & FREQ3)
00282 out.write(nd+offs,3 * frnum);
00283 else
00284 out.write(nd+offs,INTSIZE * frnum);
00285 }
00286
00287 if ((lev <mlev) && (ndt & INODE)) {
00288
00289 unsigned char fl=mtflags(nd);
00290 if (lev==(mlev-1))
00291
00292 fl=(fl & ~INODE) | LNODE;
00293
00294 out.write((const char*) &fl,CHARSIZE);
00295 fl=mtflags(nd);
00296
00297 out.write(nd+MSUCC_OFFS,CODESIZE);
00298
00299 int msz=mtablesz(nd);
00300 int m=msucc(nd);
00301
00302 for (int i=0; i<m; i++)
00303 savebin(out,mtable(nd) + i * msz,fl,lev+1,mlev);
00304 }
00305 }
00306
00307
00308 void ngramtable::savebin(mfstream& out)
00309 {
00310
00311 int depth=maxlev;
00312
00313 card=mentr[depth];
00314
00315 cerr << "ngramtable::savebin ";
00316
00317 out.writex((char *)&depth,INTSIZE);
00318
00319 out.write((char *)&treeflags,CHARSIZE);
00320
00321 savebin(out,tree,treeflags,0,depth);
00322
00323 cerr << "\n";
00324 }
00325
00326
00327 void ngramtable::savebin(char *filename,int depth)
00328 {
00329
00330 if (depth > maxlev) {
00331 cerr << "savebin: wrong n-gram size\n";
00332 exit(1);
00333 }
00334
00335 depth=(depth>0?depth:maxlev);
00336
00337 card=mentr[depth];
00338
00339 cerr << "savebin NgRaM " << depth << " " << card;
00340
00341 mfstream out(filename,ios::out );
00342
00343 if (dict->oovcode()!=-1)
00344 out << "NgRaM_ " << depth << " " << card << " " << info << "\n";
00345 else
00346 out << "NgRaM " << depth << " " << card << " " << info << "\n";
00347
00348 dict->save(out);
00349
00350 out.writex((char *)&depth,INTSIZE);
00351
00352 out.write((char *)&treeflags,CHARSIZE);
00353
00354 savebin(out,tree,treeflags,0,depth);
00355
00356 out.close();
00357
00358 cerr << "\n";
00359 }
00360
00361
00362 void ngramtable::loadbin(mfstream& inp,node nd,NODETYPE ndt,int lev)
00363 {
00364 static int c=0;
00365
00366
00367 inp.read(nd+WORD_OFFS,CODESIZE);
00368
00369
00370 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS;
00371
00372 int frnum=1;
00373 if (tbtype()==LEAFPROB && (ndt & LNODE))
00374 frnum=L_FREQ_NUM;
00375
00376 if ((ndt & LNODE) || I_FREQ_NUM) {
00377 if (ndt & FREQ1)
00378 inp.read(nd+offs,1 * frnum);
00379 else if (ndt & FREQ2)
00380 inp.read(nd+offs,2 * frnum);
00381 else if (ndt & FREQ3)
00382 inp.read(nd+offs,3 * frnum);
00383 else
00384 inp.read(nd+offs,4 * frnum);
00385 }
00386
00387 if (ndt & INODE) {
00388
00389
00390 inp.read(nd+FLAGS_OFFS,CHARSIZE);
00391 unsigned char fl=mtflags(nd);
00392
00393
00394 inp.read(nd+MSUCC_OFFS,CODESIZE);
00395 int m=msucc(nd);
00396
00397 if (m>0) {
00398
00399 int msz=mtablesz(nd);
00400 table mtb=mtable(nd);
00401
00402 grow(&mtb,INODE,lev+1,m,msz);
00403
00404 for (int i=0; i<m; i++)
00405 loadbin(inp,mtb + i * msz,fl,lev+1);
00406
00407 mtable(nd,mtb);
00408 }
00409
00410 mentr[lev+1]+=m;
00411 occupancy[lev+1]+=(m * mtablesz(nd));
00412
00413 } else if (!(++c % 1000000)) cerr << ".";
00414
00415 }
00416
00417
00418
00419 void ngramtable::loadbin(mfstream& inp)
00420 {
00421
00422 cerr << "loadbin ";
00423
00424 inp.readx((char *)&maxlev,INTSIZE);
00425 inp.read((char *)&treeflags,CHARSIZE);
00426
00427 loadbin(inp,tree,treeflags,0);
00428
00429 cerr << "\n";
00430 }
00431
00432
00433 void ngramtable::loadbin(const char *filename)
00434 {
00435
00436 cerr << "loadbin ";
00437 mfstream inp(filename,ios::in );
00438
00439
00440 char header[100];
00441 inp.getline(header,100);
00442
00443 cerr << header ;
00444
00445 dict->load(inp);
00446
00447 inp.readx((char *)&maxlev,INTSIZE);
00448 inp.read((char *)&treeflags,CHARSIZE);
00449
00450 loadbin(inp,tree,treeflags,0);
00451
00452 inp.close();
00453
00454 cerr << "\n";
00455 }
00456
00457
00458 void ngramtable::generate(char *filename, dictionary* extdict)
00459 {
00460 mfstream inp(filename,ios::in);
00461 int i,c=0;
00462
00463 if (!inp) {
00464 cerr << "cannot open " << filename << "\n";
00465 exit(1);
00466 }
00467
00468 cerr << "load:";
00469
00470 ngram ng(extdict==NULL?dict:extdict);
00471 if (extdict) dict->genoovcode();
00472
00473 ngram ng2(dict);
00474 dict->incflag(1);
00475
00476 cerr << "prepare initial n-grams to make table consistent\n";
00477 for (i=1; i<maxlev; i++) {
00478 ng.pushw(dict->BoS());
00479 ng.freq=1;
00480 };
00481
00482 while (inp >> ng) {
00483
00484 if (ng.size>maxlev) ng.size=maxlev;
00485
00486 ng2.trans(ng);
00487
00488 check_dictsize_bound();
00489
00490 if (ng2.size) dict->incfreq(*ng2.wordp(1),1);
00491
00492
00493
00494
00495 if (filterdict) {
00496 int code=filterdict->encode(dict->decode(*ng2.wordp(maxlev)));
00497 if (code!=filterdict->oovcode()) put(ng2);
00498 } else put(ng2);
00499
00500 if (!(++c % 1000000)) cerr << ".";
00501
00502 }
00503
00504 cerr << "adding some more n-grams to make table consistent\n";
00505 for (i=1; i<=maxlev; i++) {
00506 ng2.pushw(dict->BoS());
00507 ng2.freq=1;
00508
00509
00510
00511
00512 if (filterdict) {
00513 int code=filterdict->encode(dict->decode(*ng2.wordp(maxlev)));
00514 if (code!=filterdict->oovcode()) put(ng2);
00515 } else put(ng2);
00516 };
00517
00518 dict->incflag(0);
00519 inp.close();
00520 strcpy(info,"ngram");
00521
00522 cerr << "\n";
00523 }
00524
00525 void ngramtable::generate_hmask(char *filename,char* hmask,int inplen)
00526 {
00527 mfstream inp(filename,ios::in);
00528 int i,c=0;
00529 int selmask[MAX_NGRAM];
00530
00531 if (!inp) {
00532 cerr << "cannot open " << filename << "\n";
00533 exit(1);
00534 }
00535
00536
00537 i=0;
00538 selmask[i++]=1;
00539 for (c=0; c< (int)strlen(hmask); c++) {
00540 cerr << hmask[c] << "\n";
00541 if (hmask[c] == '1')
00542 selmask[i++]=c+2;
00543 }
00544 if (i!= maxlev) {
00545 cerr << "wrong mask: 1 bits=" << i << " maxlev=" << maxlev << "\n";
00546 exit(1);
00547 }
00548
00549
00550
00551 cerr << "load:";
00552
00553 ngram ng(dict);
00554 ngram ng2(dict);
00555 dict->incflag(1);
00556 while (inp >> ng) {
00557
00558 if (inplen && ng.size<inplen) continue;
00559
00560 ng2.trans(ng);
00561 ng.size=0;
00562
00563 if (ng2.size >= selmask[maxlev-1]) {
00564 for (i=0; i<maxlev; i++)
00565 *ng2.wordp(i+1)=*ng2.wordp(selmask[i]);
00566
00567
00568 check_dictsize_bound();
00569
00570 put(ng2);
00571 }
00572
00573 if (ng2.size) dict->incfreq(*ng2.wordp(1),1);
00574
00575 if (!(++c % 1000000)) cerr << ".";
00576 };
00577
00578 dict->incflag(0);
00579 inp.close();
00580 sprintf(info,"hm%s\n",hmask);
00581
00582 cerr << "\n";
00583 }
00584
00585 int cmpint(const void *a,const void *b)
00586 {
00587 return (*(int *)b)-(*(int *)a);
00588 }
00589
00590 void ngramtable::generate_dstco(char *filename,int dstco)
00591 {
00592 mfstream inp(filename,ios::in);
00593 int c=0;
00594
00595 if (!inp) {
00596 cerr << "cannot open " << filename << "\n";
00597 exit(1);
00598 }
00599
00600 cerr << "load distant co-occurrences:";
00601 if (dstco>MAX_NGRAM) {
00602 cerr << "window size (" << dstco << ") exceeds MAXNGRAM\n";
00603 inp.close();
00604 exit (1);
00605 }
00606
00607 ngram ng(dict);
00608 ngram ng2(dict);
00609 ngram dng(dict);
00610 dict->incflag(1);
00611
00612 while (inp >> ng) {
00613 if (ng.size) {
00614
00615 ng2.trans(ng);
00616
00617 if (ng2.size>dstco) ng2.size=dstco;
00618
00619 check_dictsize_bound();
00620
00621 dict->incfreq(*ng2.wordp(1),1);
00622
00623 if (maxlev == 1 )
00624 cerr << "maxlev is wrong! (Possible values are 2 or 3)\n";
00625
00626 else if (maxlev == 2 ) {
00627 dng.size=2;
00628 dng.freq=1;
00629
00630
00631
00632 for (int i=2; i<=ng2.size; i++) {
00633
00634 if (*ng2.wordp(1)<*ng2.wordp(i)) {
00635 *dng.wordp(2)=*ng2.wordp(i);
00636 *dng.wordp(1)=*ng2.wordp(1);
00637 } else {
00638 *dng.wordp(1)=*ng2.wordp(i);
00639 *dng.wordp(2)=*ng2.wordp(1);
00640 }
00641
00642 put(dng);
00643 }
00644 if (!(++c % 1000000)) cerr << ".";
00645 } else {
00646 dng.size=3;
00647 dng.freq=1;
00648
00649
00650 int ar[3];
00651
00652 ar[0]=*ng2.wordp(1);
00653 for (int i=2; i<ng2.size; i++) {
00654 ar[1]=*ng2.wordp(i);
00655 for (int j=i+1; j<=ng2.size; j++) {
00656 ar[2]=*ng2.wordp(j);
00657
00658
00659 qsort(ar,3,sizeof(int),cmpint);
00660
00661 *dng.wordp(1)=ar[0];
00662 *dng.wordp(2)=ar[1];
00663 *dng.wordp(3)=ar[2];
00664
00665
00666
00667
00668
00669
00670 put(dng);
00671 }
00672 }
00673 }
00674 }
00675 }
00676 dict->incflag(0);
00677 inp.close();
00678 sprintf(info,"co-occ%d\n",dstco);
00679 cerr << "\n";
00680 }
00681
00682
00683
00684 void ngramtable::augment(ngramtable* ngt)
00685 {
00686
00687 if (ngt->maxlev != maxlev) {
00688 cerr << "ngt augmentation is not possible "
00689 << "due to table incompatibility!";
00690 exit(1);
00691 }
00692
00693 if (ngt->dict->oovcode()!=-1)
00694 cerr <<"oov: " << ngt->dict->freq(ngt->dict->oovcode()) << "\n";
00695 cerr <<"size: " << ngt->dict->size() << "\n";
00696
00697 if (dict->oovcode()!=-1)
00698 cerr <<"oov: " << dict->freq(dict->oovcode()) << "\n";
00699 cerr <<"size: " << dict->size() << "\n";
00700
00701
00702 dict->incflag(1);
00703 cerr << "augmenting ngram table\n";
00704 ngram ng1(ngt->dict);
00705 ngram ng2(dict);
00706 ngt->scan(ng1,INIT);
00707 int c=0;
00708 while (ngt->scan(ng1,CONT)) {
00709 ng2.trans(ng1);
00710 put(ng2);
00711 if ((++c % 1000000) ==0) cerr <<".";
00712 }
00713 cerr << "\n";
00714
00715 for (int i=0; i<ngt->dict->size(); i++)
00716 dict->incfreq(dict->encode(ngt->dict->decode(i)),
00717 ngt->dict->freq(i));
00718
00719 dict->incflag(0);
00720
00721 int oov=dict->getcode(dict->OOV());
00722
00723 if (oov>=0) {
00724 dict->oovcode(oov);
00725 }
00726
00727 cerr << "oov: " << dict->freq(dict->oovcode()) << "\n";
00728 cerr << "size: " << dict->size() << "\n";
00729 }
00730
00731 void ngramtable::show()
00732 {
00733
00734 ngram ng(dict);
00735
00736 scan(ng,INIT);
00737 cout << "Stampo contenuto della tabella\n";
00738 while (scan(ng)) {
00739 cout << ng << "\n";
00740 }
00741 }
00742
00743
00744
00745 int ngramtable::mybsearch(char *ar, int n, int size, unsigned char *key, int *idx)
00746 {
00747 if (n==0) return 0;
00748
00749 register int low = 0, high = n;
00750 *idx=0;
00751 register unsigned char *p=NULL;
00752 int result;
00753
00754 #ifdef INTERP_SEARCH
00755 char* lp;
00756 char* hp;
00757 #endif
00758
00759
00760
00761
00762
00763
00764
00765 while (low < high) {
00766
00767
00768 #ifdef INTERP_SEARCH
00769
00770
00771 if ((high-low)>=10000) {
00772
00773 lp=(char *) (ar + (low * size));
00774 if (codecmp((char *)key,lp)<0) {
00775 *idx=low;
00776 return 0;
00777 }
00778
00779 hp=(char *) (ar + ((high-1) * size));
00780 if (codecmp((char *)key,hp)>0) {
00781 *idx=high;
00782 return 0;
00783 }
00784
00785 *idx= low + ((high-1)-low) * codediff((char *)key,lp)/codediff(hp,(char *)lp);
00786 } else
00787 #endif
00788 *idx = (low + high) / 2;
00789
00790
00791
00792
00793 p = (unsigned char *) (ar + (*idx * size));
00794 result=codecmp((char *)key,(char *)p);
00795
00796 if (result < 0) {
00797 high = *idx;
00798 } else if (result > 0) {
00799 low = ++(*idx);
00800 } else
00801 return 1;
00802 }
00803
00804 *idx=low;
00805
00806 return 0;
00807
00808 }
00809
00810 void *ngramtable::search(table *tb,NODETYPE ndt,int lev,int n,int sz,int *ngp,
00811 ACTION action,char **found)
00812 {
00813
00814 char w[CODESIZE];
00815 putmem(w,ngp[0],0,CODESIZE);
00816 int wint=ngp[0];
00817
00818
00819
00820
00821 if (found) *found=NULL;
00822
00823 int idx=0;
00824
00825 switch(action) {
00826
00827 case ENTER:
00828
00829 if (!*tb ||
00830 !mybsearch(*tb,n,sz,(unsigned char *)w,&idx)) {
00831
00832 grow(tb,ndt,lev,n,sz);
00833
00834
00835
00836 memmove(*tb + (idx+1) * sz,
00837 *tb + idx * sz,
00838 (n-idx) * sz);
00839
00840 memset(*tb + idx * sz , 0 , sz);
00841
00842 word(*tb + idx * sz, wint);
00843
00844 } else if (found) *found=*tb + ( idx * sz );
00845
00846 return *tb + ( idx * sz );
00847
00848 break;
00849
00850
00851 case FIND:
00852
00853 if (!*tb ||
00854 !mybsearch(*tb,n,sz,(unsigned char *)w,&idx))
00855 return 0;
00856 else if (found) *found=*tb + (idx * sz);
00857
00858 return *tb + (idx * sz);
00859
00860 break;
00861
00862 case DELETE:
00863
00864 if (*tb &&
00865 mybsearch(*tb,n,sz,(unsigned char *)w,&idx)) {
00866
00867
00868 static char buffer[100];
00869
00870 memcpy(buffer,*tb + idx * sz , sz);
00871
00872 if (idx <(n-1))
00873 memmove(*tb + idx * sz,
00874 *tb + (idx + 1) * sz,
00875 (n-idx-1) * sz);
00876
00877
00878
00879 memcpy(*tb + (n-1) * sz , buffer , sz);
00880
00881 if (found) *found=*tb + (n-1) * sz ;
00882
00883 return *tb + (n-1) * sz ;
00884
00885 } else
00886
00887 return NULL;
00888
00889 break;
00890
00891 default:
00892 cerr << "this option is not implemented yet\n";
00893 break;
00894 }
00895
00896 return NULL;
00897
00898 }
00899
00900 int ngramtable::comptbsize(int n)
00901 {
00902
00903 if (n>16384)
00904 return(n/16384)*16384+(n % 16384?16384:0);
00905 else if (n>8192) return 16384;
00906 else if (n>4096) return 8192;
00907 else if (n>2048) return 4096;
00908 else if (n>1024) return 2048;
00909 else if (n>512) return 1024;
00910 else if (n>256) return 512;
00911 else if (n>128) return 256;
00912 else if (n>64) return 128;
00913 else if (n>32) return 64;
00914 else if (n>16) return 32;
00915 else if (n>8) return 16;
00916 else if (n>4) return 8;
00917 else if (n>2) return 4;
00918 else if (n>1) return 2;
00919 else return 1;
00920
00921 }
00922
00923
00924 char **ngramtable::grow(table *tb,NODETYPE ndt,int lev,
00925 int n,int sz,NODETYPE oldndt)
00926 {
00927 int inc;
00928 int num;
00929
00930
00931
00932 if (oldndt==0) {
00933
00934 if ((*tb==NULL) && n>0) {
00935
00936
00937
00938 if (n>16384)
00939 inc=(n/16384)*16384+(n % 16384?16384:0);
00940 else if (n>8192) inc=16384;
00941 else if (n>4096) inc=8192;
00942 else if (n>2048) inc=4096;
00943 else if (n>1024) inc=2048;
00944 else if (n>512) inc=1024;
00945 else if (n>256) inc=512;
00946 else if (n>128) inc=256;
00947 else if (n>64) inc=128;
00948 else if (n>32) inc=64;
00949 else if (n>16) inc=32;
00950 else if (n>8) inc=16;
00951 else if (n>4) inc=8;
00952 else if (n>2) inc=4;
00953 else if (n>1) inc=2;
00954 else inc=1;
00955
00956 n=0;
00957
00958 }
00959
00960 else {
00961
00962
00963
00964
00965
00966 if ((n>=16384) && !(n % 16384)) inc=16384;
00967 else {
00968 switch (n) {
00969 case 0:
00970 inc=1;
00971 break;
00972 case 1:
00973 case 2:
00974 case 4:
00975 case 8:
00976 case 16:
00977 case 32:
00978 case 64:
00979 case 128:
00980 case 256:
00981 case 512:
00982 case 1024:
00983 case 2048:
00984 case 4096:
00985 case 8192:
00986 inc=n;
00987 break;
00988 default:
00989 return tb;
00990 }
00991 }
00992 }
00993
00994 table ntb=(char *)mem->reallocate(*tb,n * sz,(n + inc) * sz);
00995
00996 memory[lev]+= (inc * sz);
00997
00998 *tb=ntb;
00999 }
01000
01001 else {
01002
01003
01004
01005 int oldsz=0;
01006
01007
01008 num=comptbsize(n);
01009
01010 if ((ndt & INODE) && I_FREQ_NUM) {
01011 if (oldndt & FREQ1)
01012 oldsz=inodesize(1);
01013 else if (oldndt & FREQ2)
01014 oldsz=inodesize(2);
01015 else if (oldndt & FREQ3)
01016 oldsz=inodesize(3);
01017 else if (oldndt & FREQ4)
01018 oldsz=inodesize(4);
01019 else {
01020 cerr << "funzione non prevista\n";
01021 exit(1);
01022 }
01023 } else if (ndt & LNODE) {
01024 if (oldndt & FREQ1)
01025 oldsz=lnodesize(1);
01026 else if (oldndt & FREQ2)
01027 oldsz=lnodesize(2);
01028 else if (oldndt & FREQ3)
01029 oldsz=lnodesize(3);
01030 else if (oldndt & FREQ4)
01031 oldsz=lnodesize(4);
01032 else {
01033 cerr << "funzione non prevista\n";
01034 exit(1);
01035 }
01036 }
01037
01038 table ntb=(char *)mem->allocate(num * sz);
01039 memset((char *)ntb,0,num * sz);
01040
01041 if (ndt & INODE)
01042 for (int i=0; i<n; i++) {
01043 word(ntb+i*sz,word(*tb+i*oldsz));
01044 msucc(ntb+i*sz,msucc(*tb+i*oldsz));
01045 mtflags(ntb+i*sz,mtflags(*tb+i*oldsz));
01046 mtable(ntb+i*sz,mtable(*tb+i*oldsz));
01047 for (int j=0; j<I_FREQ_NUM; j++)
01048 setfreq(ntb+i*sz,ndt,getfreq(*tb+i*oldsz,oldndt,j),j);
01049 }
01050 else
01051 for (int i=0; i<n; i++) {
01052 word(ntb+i*sz,word(*tb+i*oldsz));
01053 for (int j=0; j<L_FREQ_NUM; j++)
01054 setfreq(ntb+i*sz,ndt,getfreq(*tb+i*oldsz,oldndt,j),j);
01055 }
01056
01057 mem->free(*tb,num * oldsz);
01058 memory[lev]+=num * (sz - oldsz);
01059 occupancy[lev]+=n * (sz - oldsz);
01060
01061 *tb=ntb;
01062 }
01063
01064 return tb;
01065
01066 };
01067
01068
01069 int ngramtable::put(ngram& ng)
01070 {
01071
01072 return ngramtable::put(ng,tree,treeflags,0);
01073
01074 }
01075
01076 int ngramtable::put(ngram& ng,node nd,NODETYPE ndt,int lev)
01077 {
01078 char *found;
01079 node subnd;
01080
01081 if (ng.size<maxlev) return 0;
01082
01083
01084
01085
01086
01087
01088
01089
01090
01091 for (int l=lev; l<maxlev; l++) {
01092
01093 if (I_FREQ_NUM || (ndt & LNODE))
01094 freq(nd,ndt,freq(nd,ndt) + ng.freq);
01095
01096 table mtb=mtable(nd);
01097
01098
01099
01100 subnd=(char *)
01101 search(&mtb,
01102 mtflags(nd),
01103 l+1,
01104 msucc(nd),
01105 mtablesz(nd),
01106 ng.wordp(maxlev-l),
01107 ENTER,&found);
01108
01109 if (!found) {
01110
01111 msucc(nd,msucc(nd)+1);
01112
01113 mentr[l+1]++;
01114 occupancy[l+1]+=mtablesz(nd);
01115
01116 unsigned char freq_flag;
01117 if (I_FREQ_NUM)
01118
01119
01120
01121 freq_flag=(ng.freq>65535?FREQ4:FREQ1);
01122 else
01123
01124
01125
01126
01127
01128 freq_flag=L_FREQ_SIZE;
01129
01130 if ((l+1)<maxlev) {
01131 if ((l+2)<maxlev)
01132 mtflags(subnd,INODE | freq_flag);
01133 else
01134 mtflags(subnd,LNODE | freq_flag);
01135
01136 }
01137 }
01138
01139
01140
01141
01142
01143 NODETYPE oldndt=mtflags(nd);
01144
01145 if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) &&
01146 (mtflags(nd) & FREQ1) &&
01147 ((freq(subnd,mtflags(nd))+ng.freq)>255))
01148
01149 mtflags(nd,(mtflags(nd) & ~FREQ1) | FREQ2);
01150
01151
01152 if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) &&
01153 (mtflags(nd) & FREQ2) &&
01154 ((freq(subnd,mtflags(nd))+ng.freq)>65535))
01155
01156 mtflags(nd,(mtflags(nd) & ~FREQ2) | FREQ3);
01157
01158
01159 if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) &&
01160 (mtflags(nd) & FREQ3) &&
01161 ((freq(subnd,mtflags(nd))+ng.freq)>16777215))
01162
01163 mtflags(nd,(mtflags(nd) & ~FREQ3) | FREQ4);
01164
01165 if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) &&
01166 (mtflags(nd) & FREQ4) &&
01167 ((freq(subnd,mtflags(nd))+ng.freq)>4294967295LL))
01168
01169 mtflags(nd,(mtflags(nd) & ~FREQ4) | FREQ6);
01170
01171 if (mtflags(nd)!=oldndt) {
01172
01173
01174 cerr << "+"<<l+1;
01175
01176 grow(&mtb,mtflags(nd),l+1,msucc(nd),mtablesz(nd),oldndt);
01177 cerr << "\b\b";
01178
01179 subnd=(char *)
01180 search(&mtb,
01181 mtflags(nd),
01182 l+1,
01183 msucc(nd),
01184 mtablesz(nd),
01185 ng.wordp(maxlev-l),
01186 FIND,&found);
01187 }
01188
01189
01190 mtable(nd,mtb);
01191 ndt=mtflags(nd);
01192 nd=subnd;
01193 }
01194
01195 freq(nd, ndt, freq(nd,ndt) + ng.freq);
01196
01197 return 1;
01198 }
01199
01200
01201
01202 int ngramtable::get(ngram& ng,int n,int lev)
01203 {
01204
01205 node nd,subnd;
01206 char *found;
01207 NODETYPE ndt;
01208
01209 assert(lev <= n && lev <= maxlev && ng.size >= n);
01210
01211 if ((I_FREQ_NUM==0) && (lev < maxlev)) {
01212 cerr << "get: for this type of table ngram cannot be smaller than table size\n";
01213 exit(1);
01214 }
01215
01216
01217 if (ng.wordp(n)) {
01218
01219 nd=tree;
01220 ndt=treeflags;
01221
01222 for (int l=0; l<lev; l++) {
01223
01224 table mtb=mtable(nd);
01225
01226 subnd=(char *)
01227 search(&mtb,
01228 mtflags(nd),
01229 l+1,
01230 msucc(nd),
01231 mtablesz(nd),
01232 ng.wordp(n-l),
01233 FIND,&found);
01234
01235 ndt=mtflags(nd);
01236 nd=subnd;
01237
01238 if (nd==0) return 0;
01239 }
01240
01241 ng.size=n;
01242 ng.freq=freq(nd,ndt);
01243 ng.link=nd;
01244 ng.lev=lev;
01245 ng.pinfo=ndt;
01246
01247 if (lev<maxlev) {
01248 ng.succ=msucc(nd);
01249 ng.info=mtflags(nd);
01250 } else {
01251 ng.succ=0;
01252 ng.info=LNODE;
01253 }
01254 return 1;
01255 }
01256 return 0;
01257 }
01258
01259
01260 int ngramtable::scan(node nd,NODETYPE ,int lev,ngram& ng,ACTION action,int maxl)
01261 {
01262
01263 assert(lev<=maxlev);
01264
01265 if ((I_FREQ_NUM==0) && (maxl < maxlev)) {
01266 cerr << "scan: ngram cannot be smaller than LEAFPROB table\n";
01267 exit(1);
01268 }
01269
01270
01271 if (maxl==-1) maxl=maxlev;
01272
01273 ng.size=maxl;
01274
01275 switch (action) {
01276
01277
01278 case INIT:
01279
01280
01281 for (int l=0; l<=maxlev; l++) ng.midx[l]=0;
01282
01283 return 1;
01284
01285 case CONT:
01286
01287 if (lev>(maxl-1)) return 0;
01288
01289 if (ng.midx[lev]<msucc(nd)) {
01290
01291 *ng.wordp(maxl-lev)=
01292 word(mtable(nd)+ng.midx[lev] * mtablesz(nd));
01293
01294
01295
01296
01297 if (lev<(maxl-1)) {
01298 if (scan(mtable(nd) + ng.midx[lev] * mtablesz(nd),
01299 INODE,
01300 lev+1,ng,CONT,maxl))
01301 return 1;
01302 else {
01303 ng.midx[lev]++;
01304 for (int l=lev+1; l<=maxlev; l++) ng.midx[l]=0;
01305
01306 return scan(nd,INODE,lev,ng,CONT,maxl);
01307 }
01308 } else {
01309
01310
01311 *ng.wordp(maxl-lev)=
01312 word(mtable(nd)+ng.midx[lev] * mtablesz(nd));
01313
01314 ng.freq=freq(mtable(nd)+ ng.midx[lev] * mtablesz(nd),mtflags(nd));
01315 ng.pinfo=mtflags(nd);
01316
01317 if (maxl<maxlev) {
01318 ng.info=mtflags(mtable(nd)+ ng.midx[lev] * mtablesz(nd));
01319 ng.link=mtable(nd)+ng.midx[lev] * mtablesz(nd);
01320 ng.succ=msucc(mtable(nd)+ ng.midx[lev] * mtablesz(nd));
01321 } else {
01322 ng.info=LNODE;
01323 ng.link=NULL;
01324 ng.succ=0;
01325 }
01326
01327 ng.midx[lev]++;
01328
01329 return 1;
01330 }
01331 } else
01332 return 0;
01333
01334 default:
01335 cerr << "scan: not supported action\n";
01336 break;
01337
01338 }
01339 return 0;
01340 }
01341
01342
01343 void ngramtable::freetree(node nd)
01344 {
01345 int m=msucc(nd);
01346 int msz=mtablesz(nd);
01347 int truem=comptbsize(m);
01348
01349 if (mtflags(nd) & INODE)
01350 for (int i=0; i<m; i++)
01351 freetree(mtable(nd) + i * msz);
01352 mem->free(mtable(nd),msz*truem);
01353 }
01354
01355
01356 ngramtable::~ngramtable()
01357 {
01358 freetree(tree);
01359 delete [] tree;
01360 delete mem;
01361 delete [] memory;
01362 delete [] occupancy;
01363 delete [] mentr;
01364 delete dict;
01365 };
01366
01367 void ngramtable::stat(int level)
01368 {
01369 int totmem=0;
01370 int totwaste=0;
01371 float mega=1024 * 1024;
01372
01373 cout.precision(2);
01374
01375 cout << "ngramtable class statistics\n";
01376
01377 cout << "levels " << maxlev << "\n";
01378 for (int l=0; l<=maxlev; l++) {
01379 cout << "lev " << l
01380 << " entries "<< mentr[l]
01381 << " allocated mem " << memory[l]/mega << "Mb "
01382 << " used mem " << occupancy[l]/mega << "Mb \n";
01383 totmem+=memory[l];
01384 totwaste+=(memory[l]-occupancy[l]);
01385 }
01386
01387 cout << "total allocated mem " << totmem/mega << "Mb ";
01388 cout << "wasted mem " << totwaste/mega << "Mb\n\n\n";
01389
01390 if (level >1 ) dict->stat();
01391
01392 cout << "\n\n";
01393
01394 if (level >2) mem->stat();
01395
01396 }
01397
01398
01399 double ngramtable::prob(ngram ong)
01400 {
01401
01402 if (ong.size==0) return 0.0;
01403 if (ong.size>maxlev) ong.size=maxlev;
01404
01405 assert(tbtype()==LEAFPROB && ong.size<=maxlev);
01406
01407 ngram ng(dict);
01408 ng.trans(ong);
01409
01410 double bo;
01411
01412 ng.size=maxlev;
01413 for (int s=ong.size+1; s<=maxlev; s++)
01414 *ng.wordp(s)=du_code;
01415
01416 if (get(ng)) {
01417
01418 if (ong.size>1 && resolution<10000000)
01419 return (double)pow(decay,(resolution-ng.freq));
01420 else
01421 return (double)(ng.freq+1)/10000000.0;
01422
01423 } else {
01424
01425 bo_state(1);
01426
01427 *ng.wordp(1)=bo_code;
01428
01429 if (get(ng))
01430
01431 bo=resolution<10000000
01432 ?(double)pow(decay,(resolution-ng.freq))
01433 :(double)(ng.freq+1)/10000000.0;
01434
01435 else
01436 bo=1.0;
01437
01438 ong.size--;
01439
01440 return bo * prob(ong);
01441 }
01442 }
01443
01444
01445 bool ngramtable::check_dictsize_bound()
01446 {
01447 if (dict->size() >= code_range[CODESIZE]) {
01448 cerr
01449 << "dictionary size overflows code range "
01450 << code_range[CODESIZE] << "\n";
01451 exit(1);
01452 }
01453 return true;
01454 }
01455
01456
01457
01458
01459
01460
01461
01462
01463
01464
01465
01466
01467
01468
01469
01470
01471