00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <stdio.h>
00024 #include <cstdlib>
00025 #include <stdlib.h>
00026 #include <fcntl.h>
00027 #include <iostream>
00028 #include <fstream>
00029 #include <stdexcept>
00030 #include <string>
00031 #include <set>
00032 #include <cassert>
00033 #include "math.h"
00034 #include "mempool.h"
00035 #include "htable.h"
00036 #include "ngramcache.h"
00037 #include "dictionary.h"
00038 #include "n_gram.h"
00039 #include "lmContainer.h"
00040 #include "lmtable.h"
00041
00042 #include "util.h"
00043
00044
00045 #define NOPROB ((float)-1.329227995784915872903807060280344576e36)
00046
00047 using namespace std;
00048
// Reports a fatal condition: the text is echoed on stderr followed by a
// newline, then raised as a std::runtime_error carrying the same text.
inline void error(const char* message)
{
  std::cerr << message << '\n';
  throw std::runtime_error(std::string(message));
}
00054
00055 void print(prob_and_state_t* pst, std::ostream& out)
00056 {
00057 if (pst != NULL) {
00058 out << "PST [";
00059 out << "logpr:" << pst->logpr;
00060 out << ",state:" << (void*) pst->state;
00061 out << ",statesize:" << pst->statesize;
00062 out << ",bow:" << pst->bow;
00063 out << ",bol:" << pst->bol;
00064 out << "]";
00065 out << std::endl;
00066 } else {
00067 out << "PST [NULL]" << std::endl;
00068 }
00069 }
00070
00071
00072 lmtable::lmtable(float nlf, float dlf):lmContainer()
00073 {
00074 ngramcache_load_factor = nlf;
00075 dictionary_load_factor = dlf;
00076 isInverted=false;
00077 configure(1,false);
00078
00079 dict=new dictionary((char *)NULL,1000000,dictionary_load_factor);
00080 delete_dict=true;
00081
00082 memset(table, 0, sizeof(table));
00083 memset(tableGaps, 0, sizeof(tableGaps));
00084 memset(cursize, 0, sizeof(cursize));
00085 memset(tbltype, 0, sizeof(tbltype));
00086 memset(maxsize, 0, sizeof(maxsize));
00087 memset(tb_offset, 0, sizeof(maxsize));
00088 memset(info, 0, sizeof(info));
00089 memset(NumCenters, 0, sizeof(NumCenters));
00090
00091 max_cache_lev=0;
00092 for (int i=0; i<LMTMAXLEV+1; i++) lmtcache[i]=NULL;
00093 prob_and_state_cache=NULL;
00094
00095 #ifdef TRACE_CACHELM
00096
00097 cacheout=new std::fstream("/tmp/tracecache",std::ios::out);
00098 sentence_id=0;
00099 #endif
00100
00101 memmap=0;
00102 requiredMaxlev=1000;
00103
00104 isPruned=false;
00105 isInverted=false;
00106
00107
00108 for (int i=0; i<=LMTMAXLEV+1; i++) totget[i]=totbsearch[i]=0;
00109
00110 logOOVpenalty=0.0;
00111
00112
00113 setOrderQuery(false);
00114 };
00115
00116 lmtable::~lmtable()
00117 {
00118 delete_caches();
00119
00120 #ifdef TRACE_CACHELM
00121 cacheout->close();
00122 delete cacheout;
00123 #endif
00124
00125 for (int l=1; l<=maxlev; l++) {
00126 if (table[l]) {
00127 if (memmap > 0 && l >= memmap)
00128 Munmap(table[l]-tableGaps[l],cursize[l]*nodesize(tbltype[l])+tableGaps[l],0);
00129 else
00130 delete [] table[l];
00131 }
00132 if (isQtable) {
00133 if (Pcenters[l]) delete [] Pcenters[l];
00134 if (l<maxlev)
00135 if (Bcenters[l]) delete [] Bcenters[l];
00136 }
00137 }
00138
00139 if (delete_dict) delete dict;
00140 };
00141
00142 void lmtable::init_prob_and_state_cache()
00143 {
00144 #ifdef PS_CACHE_ENABLE
00145 assert(prob_and_state_cache==NULL);
00146 prob_and_state_cache=new NGRAMCACHE_t(maxlev,sizeof(prob_and_state_t),400000,ngramcache_load_factor);
00147 std::cerr << "creating cache for storing prob, state and statesize of ngrams" << std:: endl;
00148 #endif
00149 }
00150
00151 void lmtable::init_lmtcaches(int uptolev)
00152 {
00153 max_cache_lev=uptolev;
00154 #ifdef LMT_CACHE_ENABLE
00155 for (int i=2; i<=max_cache_lev; i++) {
00156 assert(lmtcache[i]==NULL);
00157 lmtcache[i]=new NGRAMCACHE_t(i,sizeof(char*),200000,ngramcache_load_factor);
00158 std::cerr << "creating cache for storing pointers to the LM for ngram of size " << i << std:: endl;
00159 }
00160 #endif
00161 }
00162
00163 void lmtable::init_caches(int uptolev)
00164 {
00165 init_prob_and_state_cache();
00166 init_lmtcaches(uptolev);
00167 }
00168
00169 void lmtable::delete_prob_and_state_cache()
00170 {
00171 #ifdef PS_CACHE_ENABLE
00172 if (prob_and_state_cache) delete prob_and_state_cache;
00173 prob_and_state_cache=NULL;
00174 std::cerr << "deleting cache for storing prob, state and statesize of ngrams" << std:: endl;
00175 #endif
00176 }
00177
00178 void lmtable::delete_lmtcaches()
00179 {
00180 #ifdef LMT_CACHE_ENABLE
00181 for (int i=2; i<=max_cache_lev; i++) {
00182 if (lmtcache[i]) delete lmtcache[i];
00183 lmtcache[i]=NULL;
00184 std::cerr << "deleting cache for storing pointers to the LM for ngram of size " << i << std:: endl;
00185 }
00186 #endif
00187 }
00188
00189 void lmtable::delete_caches()
00190 {
00191 delete_prob_and_state_cache();
00192 delete_lmtcaches();
00193 }
00194
00195
00196 void lmtable::used_prob_and_state_cache()
00197 {
00198 #ifdef PS_CACHE_ENABLE
00199 std::cerr << "prob_and_state_cache() ";
00200 if (prob_and_state_cache) prob_and_state_cache->used();
00201 #endif
00202 }
00203
00204 void lmtable::used_lmtcaches()
00205 {
00206 #ifdef LMT_CACHE_ENABLE
00207 for (int i=2; i<=max_cache_lev; i++) {
00208 std::cerr << "lmtcaches with order " << i << " ";
00209 if (lmtcache[i]) lmtcache[i]->used();
00210 }
00211 #endif
00212 }
00213
00214 void lmtable::used_caches()
00215 {
00216 used_prob_and_state_cache();
00217 used_lmtcaches();
00218 }
00219
00220
00221 void lmtable::check_prob_and_state_cache_levels()
00222 {
00223 #ifdef PS_CACHE_ENABLE
00224 if (prob_and_state_cache && prob_and_state_cache->isfull())
00225 prob_and_state_cache->reset(prob_and_state_cache->cursize());
00226 #endif
00227 }
00228
00229 void lmtable::check_lmtcaches_levels()
00230 {
00231 #ifdef LMT_CACHE_ENABLE
00232 for (int i=2; i<=max_cache_lev; i++)
00233 if (lmtcache[i]->isfull()) lmtcache[i]->reset(lmtcache[i]->cursize());
00234 #endif
00235 }
00236
00237 void lmtable::check_caches_levels()
00238 {
00239 check_prob_and_state_cache_levels();
00240 check_lmtcaches_levels();
00241 }
00242
00243 void lmtable::reset_prob_and_state_cache()
00244 {
00245 #ifdef PS_CACHE_ENABLE
00246 if (prob_and_state_cache)
00247 prob_and_state_cache->reset(MAX(prob_and_state_cache->cursize(),prob_and_state_cache->maxsize()));
00248 #endif
00249 }
00250
00251 void lmtable::reset_lmtcaches()
00252 {
00253 #ifdef LMT_CACHE_ENABLE
00254 for (int i=2; i<=max_cache_lev; i++)
00255 lmtcache[i]->reset(MAX(lmtcache[i]->cursize(),lmtcache[i]->maxsize()));
00256 #endif
00257 }
00258
00259 void lmtable::reset_caches()
00260 {
00261 reset_prob_and_state_cache();
00262 reset_lmtcaches();
00263 }
00264
00265 bool lmtable::are_prob_and_state_cache_active()
00266 {
00267 #ifdef PS_CACHE_ENABLE
00268 return prob_and_state_cache!=NULL;
00269 #else
00270 return false;
00271 #endif
00272 }
00273
00274 bool lmtable::are_lmtcaches_active()
00275 {
00276 #ifdef LMT_CACHE_ENABLE
00277 if (max_cache_lev < 2)
00278 return false;
00279 for (int i=2; i<=max_cache_lev; i++)
00280 if (lmtcache[i]==NULL) return false;
00281 return true;
00282 #else
00283 return false;
00284 #endif
00285 }
00286
00287 bool lmtable::are_caches_active()
00288 {
00289 return (are_prob_and_state_cache_active() && are_lmtcaches_active());
00290 }
00291
00292 void lmtable::configure(int n,bool quantized)
00293 {
00294 VERBOSE(2,"void lmtable::configure(int n,bool quantized) with n:" << n << std::endl);
00295 maxlev=n;
00296 VERBOSE(2," maxlev:" << maxlev << " maxlevel():" << maxlevel() << " this->maxlevel():" << this->maxlevel() << std::endl);
00297
00298
00299 for (int i=0; i<n; i++) tbltype[i]=(quantized?QINTERNAL:INTERNAL);
00300 tbltype[n]=(quantized?QLEAF:LEAF);
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310 }
00311
00312
00313 void lmtable::load(const std::string infile, int mmap)
00314 {
00315 VERBOSE(2,"lmtable::load(const std::string filename, int mmap)" << std::endl);
00316 VERBOSE(2,"Reading " << infile << "..." << std::endl);
00317 inputfilestream inp(infile.c_str());
00318
00319 if (!inp.good()) {
00320 std::cerr << "Failed to open " << infile << "!" << std::endl;
00321 exit(1);
00322 }
00323 setMaxLoadedLevel(requiredMaxlev);
00324
00325
00326 if (infile.compare(infile.size()-3,3,".mm")==0) {
00327 mmap=1;
00328 }
00329
00330 if (mmap>0) {
00331 #ifdef WIN32
00332 mmap=0;
00333 #endif
00334 }
00335
00336 load(inp,infile.c_str(),NULL,mmap,NONE);
00337 getDict()->incflag(0);
00338 }
00339
00340 void lmtable::load(istream& inp,const char* filename,const char* outfilename,int keep_on_disk, OUTFILE_TYPE )
00341 {
00342
00343 VERBOSE(2,"lmtable::load(istream& inp,...)" << std::endl);
00344
00345 #ifdef WIN32
00346 if (keep_on_disk>0) {
00347 std::cerr << "lmtable::load memory mapping not yet available under WIN32\n";
00348 keep_on_disk = 0;
00349 }
00350 #endif
00351
00352
00353 char header[MAX_LINE];
00354 inp >> header;
00355 std::cerr << header << "\n";
00356
00357 if (strncmp(header,"Qblmt",5)==0 || strncmp(header,"blmt",4)==0) {
00358 loadbin(inp,header,filename,keep_on_disk);
00359 } else {
00360
00361 if (keep_on_disk && outfilename==NULL) {
00362 cerr << "Load Error: inconsistent setting. Passed input file: textual. Memory map: yes. Outfilename: not specified.\n";
00363 exit(0);
00364 }
00365
00366 loadtxt(inp,header,outfilename,keep_on_disk);
00367 }
00368
00369 cerr << "OOV code is " << lmtable::getDict()->oovcode() << "\n";
00370 }
00371
00372
00373
00374
00375 int lmtable::reload(std::set<string> words)
00376 {
00377
00378 dictionary dict(NULL,(int)words.size());
00379 dict.incflag(1);
00380
00381 std::set<string>::iterator w;
00382 for (w = words.begin(); w != words.end(); ++w)
00383 dict.encode((*w).c_str());
00384
00385 return 1;
00386 }
00387
00388
00389
00390 void lmtable::load_centers(istream& inp,int Order)
00391 {
00392 char line[MAX_LINE];
00393
00394
00395 cerr << Order << " read code book ";
00396 inp >> NumCenters[Order];
00397 Pcenters[Order]=new float[NumCenters[Order]];
00398 Bcenters[Order]=(Order<maxlev?new float[NumCenters[Order]]:NULL);
00399
00400 for (int c=0; c<NumCenters[Order]; c++) {
00401 inp >> Pcenters[Order][c];
00402 if (Order<maxlev) inp >> Bcenters[Order][c];
00403 };
00404
00405 inp.getline((char*)line,MAX_LINE);
00406 }
00407
00408 void lmtable::loadtxt(istream& inp,const char* header,const char* outfilename,int mmap)
00409 {
00410 if (mmap>0)
00411 loadtxt_mmap(inp,header,outfilename);
00412 else {
00413 loadtxt_ram(inp,header);
00414 lmtable::getDict()->genoovcode();
00415 }
00416 }
00417
00418 void lmtable::loadtxt_mmap(istream& inp,const char* header,const char* outfilename)
00419 {
00420
00421 char nameNgrams[BUFSIZ];
00422 char nameHeader[BUFSIZ];
00423
00424 FILE *fd = NULL;
00425 table_pos_t filesize=0;
00426
00427 int Order,n;
00428
00429 int maxlevel_h;
00430
00431
00432
00433 char line[MAX_LINE];
00434
00435
00436
00437 lmtable::getDict()->incflag(1);
00438
00439
00440 isQtable=(strncmp(header,"qARPA",5)==0?true:false);
00441
00442
00443 isItable=(strncmp(header,"iARPA",5)==0?true:false);
00444
00445 if (isQtable) {
00446
00447 inp >> line;
00448 if (!(maxlevel_h=atoi(line))) {
00449 cerr << "loadtxt with mmap requires new qARPA header. Please regenerate the file.\n";
00450 exit(1);
00451 }
00452
00453 for (n=1; n<=maxlevel_h; n++) {
00454 inp >> line;
00455 if (!(NumCenters[n]=atoi(line))) {
00456 cerr << "loadtxt with mmap requires new qARPA header. Please regenerate the file.\n";
00457 exit(0);
00458 }
00459 }
00460 }
00461
00462
00463 bool yetconfigured=false;
00464
00465 cerr << "loadtxtmmap()\n";
00466
00467
00468
00469 while (inp.getline(line,MAX_LINE)) {
00470
00471 if (strlen(line)==MAX_LINE-1) {
00472 cerr << "lmtable::loadtxt_mmap: input line exceed MAXLINE ("
00473 << MAX_LINE << ") chars " << line << "\n";
00474 exit(1);
00475 }
00476
00477 bool backslash = (line[0] == '\\');
00478
00479 if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
00480 maxsize[Order] = n;
00481 maxlev=Order;
00482 cerr << "size[" << Order << "]=" << maxsize[Order] << "\n";
00483 }
00484
00485 VERBOSE(2,"maxlev" << maxlev << std::endl);
00486 if (maxlev>requiredMaxlev) maxlev=requiredMaxlev;
00487 VERBOSE(2,"maxlev" << maxlev << std::endl);
00488 VERBOSE(2,"lmtable:requiredMaxlev" << requiredMaxlev << std::endl);
00489
00490 if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
00491
00492
00493 if (!yetconfigured) {
00494 configure(maxlev,isQtable);
00495 yetconfigured=true;
00496
00497
00498 strcpy(nameNgrams,outfilename);
00499 strcat(nameNgrams, "-ngrams");
00500
00501 fd = fopen(nameNgrams, "w+");
00502
00503
00504 for (int l=1; l<=maxlev; l++) {
00505 if (l<maxlev)
00506 filesize += (table_pos_t) maxsize[l] * nodesize(tbltype[l]) + 2 * NumCenters[l] * sizeof(float);
00507 else
00508 filesize += (table_pos_t) maxsize[l] * nodesize(tbltype[l]) + NumCenters[l] * sizeof(float);
00509 }
00510
00511
00512 ftruncate(fileno(fd),filesize);
00513 table[0]=(char *)(MMap(fileno(fd),PROT_READ|PROT_WRITE,0,filesize,&tableGaps[0]));
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523 for (int l=1; l<=maxlev; l++) {
00524 if (l<maxlev)
00525 table[l]=(char *)(table[l-1] + (table_pos_t) maxsize[l-1]*nodesize(tbltype[l-1]) +
00526 2 * NumCenters[l] * sizeof(float));
00527 else
00528 table[l]=(char *)(table[l-1] + (table_pos_t) maxsize[l-1]*nodesize(tbltype[l-1]) +
00529 NumCenters[l] * sizeof(float));
00530
00531 cerr << "table[" << l << "]-table[" << l-1 << "]="
00532 << (table_pos_t) table[l]-(table_pos_t) table[l-1] << " (nodesize=" << nodesize(tbltype[l-1]) << ")\n";
00533 }
00534 }
00535
00536 loadtxt_level(inp,Order);
00537
00538 if (isQtable) {
00539
00540 if (Order<maxlev) {
00541 memcpy(table[Order] - 2 * NumCenters[Order] * sizeof(float),
00542 Pcenters[Order],
00543 NumCenters[Order] * sizeof(float));
00544 memcpy(table[Order] - NumCenters[Order] * sizeof(float),
00545 Bcenters[Order],
00546 NumCenters[Order] * sizeof(float));
00547 } else {
00548 memcpy(table[Order] - NumCenters[Order] * sizeof(float),
00549 Pcenters[Order],
00550 NumCenters[Order] * sizeof(float));
00551 }
00552 }
00553
00554 msync(table[0],filesize,MS_SYNC);
00555
00556
00557
00558 if (maxlev>1 && Order>1) {
00559 checkbounds(Order-1);
00560 delete startpos[Order-1];
00561 }
00562 }
00563 }
00564
00565 cerr << "closing output file: " << nameNgrams << "\n";
00566 for (int i=1; i<=maxlev; i++) {
00567 if (maxsize[i] != cursize[i]) {
00568 for (int l=1; l<=maxlev; l++)
00569 cerr << "Level " << l << ": starting ngrams=" << maxsize[l] << " - actual stored ngrams=" << cursize[l] << "\n";
00570 break;
00571 }
00572 }
00573
00574 Munmap(table[0],filesize,MS_SYNC);
00575 for (int l=1; l<=maxlev; l++)
00576 table[l]=0;
00577 cerr << "running fclose...\n";
00578 fclose(fd);
00579 cerr << "done\n";
00580
00581 lmtable::getDict()->incflag(0);
00582 lmtable::getDict()->genoovcode();
00583
00584
00585
00586 strcpy(nameHeader,outfilename);
00587 strcat(nameHeader, "-header");
00588 VERBOSE(2,"saving header+dictionary in " << nameHeader << "\n");
00589 fstream out(nameHeader,ios::out);
00590
00591
00592 if (isQtable) {
00593 out << "Qblmt" << (isInverted?"I ":" ") << maxlev;
00594 for (int i=1; i<=maxlev; i++) out << " " << maxsize[i];
00595 out << "\nNumCenters";
00596 for (int i=1; i<=maxlev; i++) out << " " << NumCenters[i];
00597 out << "\n";
00598
00599 } else {
00600 out << "blmt" << (isInverted?"I ":" ") << maxlev;
00601 for (int i=1; i<=maxlev; i++) out << " " << maxsize[i];
00602 out << "\n";
00603 }
00604
00605 lmtable::getDict()->save(out);
00606
00607 out.close();
00608 cerr << "done\n";
00609
00610
00611
00612 char cmd[BUFSIZ];
00613 sprintf(cmd,"cat %s >> %s", nameNgrams, nameHeader);
00614 cerr << "run cmd <" << cmd << ">\n";
00615 system(cmd);
00616
00617 sprintf(cmd,"mv %s %s", nameHeader, outfilename);
00618 cerr << "run cmd <" << cmd << ">\n";
00619 system(cmd);
00620
00621 removefile(nameNgrams);
00622
00623
00624 exit(0);
00625 return;
00626 }
00627
00628
00629 void lmtable::loadtxt_ram(istream& inp,const char* header)
00630 {
00631
00632 char line[MAX_LINE];
00633
00634
00635 lmtable::getDict()->incflag(1);
00636
00637
00638 isQtable=(strncmp(header,"qARPA",5)==0?true:false);
00639
00640
00641 isItable=(strncmp(header,"iARPA",5)==0?true:false);
00642
00643
00644 bool yetconfigured=false;
00645
00646 cerr << "loadtxt_ram()\n";
00647
00648
00649 int Order,n;
00650
00651 while (inp.getline(line,MAX_LINE)) {
00652 if (strlen(line)==MAX_LINE-1) {
00653 cerr << "lmtable::loadtxt_ram: input line exceed MAXLINE ("
00654 << MAX_LINE << ") chars " << line << "\n";
00655 exit(1);
00656 }
00657
00658 bool backslash = (line[0] == '\\');
00659
00660 if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
00661 maxsize[Order] = n;
00662 maxlev=Order;
00663 }
00664
00665 if (maxlev>requiredMaxlev) maxlev=requiredMaxlev;
00666
00667 if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
00668
00669
00670 if (!yetconfigured) {
00671 configure(maxlev,isQtable);
00672 yetconfigured=true;
00673
00674 for (int i=1; i<=maxlev; i++)
00675 table[i] = new char[(table_pos_t) maxsize[i] * nodesize(tbltype[i])];
00676 }
00677
00678 loadtxt_level(inp,Order);
00679
00680
00681
00682
00683
00684 if (maxlev>1 && Order>1) {
00685
00686
00687
00688 checkbounds(Order-1);
00689
00690
00691
00692
00693 }
00694 }
00695 }
00696
00697 lmtable::getDict()->incflag(0);
00698 cerr << "done\n";
00699 }
00700
00701 void lmtable::loadtxt_level(istream& inp, int level)
00702 {
00703 cerr << level << "-grams: reading ";
00704
00705 if (isQtable) {
00706 load_centers(inp,level);
00707 }
00708
00709
00710 if (maxlev>1 && level<maxlev) {
00711 startpos[level]=new table_entry_pos_t[maxsize[level]];
00712 for (table_entry_pos_t c=0; c<maxsize[level]; c++) {
00713 startpos[level][c]=BOUND_EMPTY1;
00714 }
00715 }
00716
00717
00718 cerr << maxsize[level] << " entries\n";
00719
00720 float prob,bow;
00721
00722
00723 ngram ng(lmtable::getDict());
00724 ngram ing(lmtable::getDict());
00725
00726
00727 for (table_entry_pos_t c=0; c<maxsize[level]; c++) {
00728
00729 if (parseline(inp,level,ng,prob,bow)) {
00730
00731
00732 if (isInverted & level>1) {
00733 ing.invert(ng);
00734 ng=ing;
00735 }
00736
00737
00738
00739 if (isItable && level>1) {
00740
00741 get(ng,ng.size,ng.size-1);
00742 float rbow=0.0;
00743 if (ng.lev==ng.size-1) {
00744 rbow=ng.bow;
00745 }
00746
00747 int tmp=maxlev;
00748 maxlev=level-1;
00749 prob= log(exp((double)prob * M_LN10) + exp(((double)rbow + lprob(ng)) * M_LN10))/M_LN10;
00750 maxlev=tmp;
00751 }
00752
00753
00754 if (isQtable) add(ng, (qfloat_t)prob, (qfloat_t)bow);
00755 else add(ng, prob, bow);
00756 }
00757 }
00758 cerr << "done level " << level << "\n";
00759 }
00760
00761
00762 void lmtable::expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap)
00763 {
00764
00765 if (mmap>0)
00766 expand_level_mmap(level, size, outfilename);
00767 else {
00768 expand_level_nommap(level, size);
00769 }
00770 }
00771
00772 void lmtable::expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename)
00773 {
00774 maxsize[level]=size;
00775
00776
00777 char nameNgrams[BUFSIZ];
00778 sprintf(nameNgrams,"%s-%dgrams",outfilename,level);
00779
00780
00781
00782
00783 FILE *fd = NULL;
00784 fd = fopen(nameNgrams, "w+");
00785 if (fd == NULL) {
00786 perror("Error opening file for writing");
00787 exit(EXIT_FAILURE);
00788 }
00789 table_pos_t filesize=(table_pos_t) maxsize[level] * nodesize(tbltype[level]);
00790
00791 ftruncate(fileno(fd),filesize);
00792
00793
00794
00795 table[level]=(char *)(MMap(fileno(fd),PROT_READ|PROT_WRITE,0,filesize,&tableGaps[level]));
00796 if (table[level] == MAP_FAILED) {
00797 fclose(fd);
00798 perror("Error mmapping the file");
00799 exit(EXIT_FAILURE);
00800 }
00801
00802 if (maxlev>1 && level<maxlev) {
00803 startpos[level]=new table_entry_pos_t[maxsize[level]];
00804 LMT_TYPE ndt=tbltype[level];
00805 int ndsz=nodesize(ndt);
00806 char *found = table[level];
00807 for (table_entry_pos_t c=0; c<maxsize[level]; c++) {
00808 startpos[level][c]=BOUND_EMPTY1;
00809 found += ndsz;
00810
00811 }
00812 }
00813 }
00814
00815 void lmtable::expand_level_nommap(int level, table_entry_pos_t size)
00816 {
00817 VERBOSE(2,"lmtable::expand_level_nommap START Level:" << level << endl);
00818 maxsize[level]=size;
00819
00820 table[level] = new char[(table_pos_t) maxsize[level] * nodesize(tbltype[level])];
00821 if (maxlev>1 && level<maxlev) {
00822 startpos[level]=new table_entry_pos_t[maxsize[level]];
00823 LMT_TYPE ndt=tbltype[level];
00824 int ndsz=nodesize(ndt);
00825 char *found = table[level];
00826 for (table_entry_pos_t c=0; c<maxsize[level]; c++) {
00827 startpos[level][c]=BOUND_EMPTY1;
00828 found += ndsz;
00829
00830 }
00831 }
00832 VERBOSE(2,"lmtable::expand_level_nommap END Level:" << level << endl);
00833 }
00834
00835 void lmtable::printTable(int level)
00836 {
00837 char* tbl=table[level];
00838 LMT_TYPE ndt=tbltype[level];
00839 int ndsz=nodesize(ndt);
00840 table_entry_pos_t printEntryN=getCurrentSize(level);
00841
00842
00843
00844 cout << "level = " << level << " of size:" << printEntryN <<" ndsz:" << ndsz << " \n";
00845
00846
00847 float p,bw;
00848 table_entry_pos_t bnd, start;
00849
00850 if (level<maxlev){
00851 for (table_entry_pos_t c=0; c<printEntryN; c++) {
00852 p=prob(tbl,ndt);
00853 bw=bow(tbl,ndt);
00854 bnd=bound(tbl,ndt);
00855 start=startpos[level][c];
00856
00857 cerr << p << " " << word(tbl) << " -> " << dict->decode(word(tbl)) << " bw:" << bw << " bnd:" << bnd << " " << start << " tb_offset:" << tb_offset[level+1] << "\n";
00858
00859 tbl+=ndsz;
00860 }
00861 }else{
00862 for (table_entry_pos_t c=0; c<printEntryN; c++) {
00863 p=prob(tbl,ndt);
00864
00865 cerr << p << " " << word(tbl) << " -> " << dict->decode(word(tbl)) << "\n";
00866
00867 tbl+=ndsz;
00868 }
00869 }
00870 return;
00871 }
00872
00873
00874 void lmtable::checkbounds(int level)
00875 {
00876 VERBOSE(2,"lmtable::checkbounds START Level:" << level << endl);
00877
00878 if (getCurrentSize(level) > 0 ){
00879
00880 char* tbl=table[level];
00881 char* succtbl=table[level+1];
00882
00883 LMT_TYPE ndt=tbltype[level];
00884 LMT_TYPE succndt=tbltype[level+1];
00885 int ndsz=nodesize(ndt);
00886 int succndsz=nodesize(succndt);
00887
00888
00889
00890
00891 std::string filePath;
00892
00893 mfstream out;
00894 createtempfile(out, filePath, ios::out|ios::binary);
00895
00896 if (out.fail())
00897 {
00898 perror("checkbound creating out on filePath");
00899 exit(4);
00900 }
00901
00902 table_entry_pos_t start,end,newend;
00903 table_entry_pos_t succ;
00904
00905
00906 char* found;
00907 for (table_entry_pos_t c=0; c<cursize[level]; c++) {
00908 found=tbl+(table_pos_t) c*ndsz;
00909 start=startpos[level][c];
00910 end=boundwithoffset(found,ndt,level);
00911
00912 if (c>0) newend=boundwithoffset(found-ndsz,ndt,level);
00913 else newend=0;
00914
00915
00916 if (start==BOUND_EMPTY1){
00917 succ=0;
00918 }
00919 else{
00920 assert(end>start);
00921 succ=end-start;
00922 }
00923
00924 startpos[level][c]=newend;
00925 newend += succ;
00926
00927 assert(newend<=cursize[level+1]);
00928
00929
00930 if (succ>0) {
00931
00932 out.write((char*)(succtbl + (table_pos_t) start * succndsz),(table_pos_t) succ * succndsz);
00933 if (!out.good()) {
00934 std::cerr << " Something went wrong while writing temporary file " << filePath
00935 << " Maybe there is not enough space on this filesystem\n";
00936
00937 out.close();
00938 exit(2);
00939 removefile(filePath);
00940 }
00941 }
00942
00943 boundwithoffset(found,ndt,newend,level);
00944 }
00945 out.close();
00946 if (out.fail())
00947 {
00948 perror("error closing out");
00949 exit(4);
00950 }
00951
00952 fstream inp(filePath.c_str(),ios::in|ios::binary);
00953 if (inp.fail())
00954 {
00955 perror("error opening inp");
00956 exit(4);
00957 }
00958
00959 inp.read(succtbl,(table_pos_t) cursize[level+1]*succndsz);
00960 inp.close();
00961 if (inp.fail())
00962 {
00963 perror("error closing inp");
00964 exit(4);
00965 }
00966
00967 removefile(filePath);
00968 }
00969 VERBOSE(2,"lmtable::checkbounds END Level:" << level << endl);
00970 }
00971
00972
00973
00974
// Appends the n-gram `ng` to the table of level ng.size, reading and
// writing bounds through boundwithoffset().
//
// For ng.size > 1 the context is first located by walking levels
// 1..ng.size-1 with search(); the entry found at level ng.size-1 gets its
// start position recorded (if still empty) and its bound moved past the new
// entry.
//
// ng:    n-gram to store.
// iprob: value stored with prob().
// ibow:  value stored with bow(); only written when ng.size < maxlev.
//
// Returns 1 when the entry was stored, 0 when a context entry is missing.
// In the latter case a warning is printed the first time only.
int lmtable::addwithoffset(ngram& ng, float iprob, float ibow)
{
  char *found;
  LMT_TYPE ndt=tbltype[1];
  int ndsz=nodesize(ndt);
  // Counts failed insertions across all calls; drives the one-time warning.
  static int no_more_msg = 0;

  if (ng.size>1) {

    // Search window [start,end) inside the level being inspected.
    table_entry_pos_t start=0;
    table_entry_pos_t end=cursize[1];
    table_entry_pos_t position;

    for (int l=1; l<ng.size; l++) {

      ndt=tbltype[l];
      ndsz=nodesize(ndt);

      if (search(l,start,(end-start),ndsz, ng.wordp(ng.size-l+1),LMT_FIND, &found)) {

        // Narrow the window for the next level, except on the last step.
        if (l < (ng.size-1)) {

          if (found==table[l]){
            start=0;
          }
          else {
            position=(table_entry_pos_t) (((table_pos_t) (found)-(table_pos_t) table[l])/ndsz);
            start=startpos[l][position];
          }

          end=boundwithoffset(found,ndt,l);
        }
      } else {
        if (!no_more_msg)
          cerr << "warning: missing back-off (at level " << l << ") for ngram " << ng << " (and possibly for others)\n";

        no_more_msg++;
        if (!(no_more_msg % 5000000))
          cerr << "!";

        return 0;
      }
    }

    // Index of the parent entry inside level ng.size-1.
    position=(table_entry_pos_t) (((table_pos_t) found-(table_pos_t) table[ng.size-1])/ndsz);

    // First successor of this parent: remember where its block starts.
    if (startpos[ng.size-1][position]==BOUND_EMPTY1)
      startpos[ng.size-1][position]=cursize[ng.size];

    // The parent's bound now points one past the entry about to be added.
    boundwithoffset(found,ndt,cursize[ng.size]+1,ng.size-1);
  }

  assert(cursize[ng.size]< maxsize[ng.size]);
  ndt=tbltype[ng.size];
  ndsz=nodesize(ndt);

  // Write the new entry in the first free slot of its level.
  found=table[ng.size] + ((table_pos_t) cursize[ng.size] * ndsz);
  word(found,*ng.wordp(1));
  prob(found,ndt,iprob);
  if (ng.size<maxlev) {

    // Initial bound = bound of the previous entry (0 for the first entry).
    table_entry_pos_t newend;
    if (found==table[ng.size]) newend=0;
    else newend=boundwithoffset(found - ndsz,ndt,ng.size);

    bow(found,ndt,ibow);
    boundwithoffset(found,ndt,newend,ng.size);
  }
  cursize[ng.size]++;

  // Progress marker every 5,000,000 stored entries.
  if (!(cursize[ng.size]%5000000))
    cerr << ".";

  return 1;

};
01058
01059
01060
01061
01062
// Appends the n-gram `ng` to the table of level ng.size, reading and
// writing bounds through bound().
//
// For ng.size > 1 the context is first located by walking levels
// 1..ng.size-1 with search(); the entry found at level ng.size-1 gets its
// start position recorded (if still empty) and its bound moved past the new
// entry.
//
// ng:    n-gram to store.
// iprob: value stored with prob().
// ibow:  value stored with bow(); only written when ng.size < maxlev.
//
// Returns 1 when the entry was stored, 0 when a context entry is missing.
// In the latter case a warning is printed the first time only.
int lmtable::add(ngram& ng, float iprob, float ibow)
{
  char *found;
  LMT_TYPE ndt=tbltype[1];
  int ndsz=nodesize(ndt);
  // Counts failed insertions across all calls; drives the one-time warning.
  static int no_more_msg = 0;

  if (ng.size>1) {

    // Search window [start,end) inside the level being inspected.
    table_entry_pos_t start=0;
    table_entry_pos_t end=cursize[1];
    table_entry_pos_t position;

    for (int l=1; l<ng.size; l++) {

      ndt=tbltype[l];
      ndsz=nodesize(ndt);

      if (search(l,start,(end-start),ndsz, ng.wordp(ng.size-l+1),LMT_FIND, &found)) {

        // Narrow the window for the next level, except on the last step.
        if (l < (ng.size-1)) {

          if (found==table[l]){
            start=0;
          }
          else {
            position=(table_entry_pos_t) (((table_pos_t) (found)-(table_pos_t) table[l])/ndsz);
            start=startpos[l][position];
          }

          end=bound(found,ndt);
        }
      }
      else {
        if (!no_more_msg)
          cerr << "warning: missing back-off (at level " << l << ") for ngram " << ng << " (and possibly for others)\n";

        no_more_msg++;
        if (!(no_more_msg % 5000000))
          cerr << "!";

        return 0;
      }
    }

    // Index of the parent entry inside level ng.size-1.
    position=(table_entry_pos_t) (((table_pos_t) found-(table_pos_t) table[ng.size-1])/ndsz);

    // First successor of this parent: remember where its block starts.
    if (startpos[ng.size-1][position]==BOUND_EMPTY1)
      startpos[ng.size-1][position]=cursize[ng.size];

    // The parent's bound now points one past the entry about to be added.
    bound(found,ndt,cursize[ng.size]+1);
  }

  assert(cursize[ng.size]< maxsize[ng.size]);
  ndt=tbltype[ng.size];
  ndsz=nodesize(ndt);

  // Write the new entry in the first free slot of its level.
  found=table[ng.size] + ((table_pos_t) cursize[ng.size] * ndsz);
  word(found,*ng.wordp(1));
  prob(found,ndt,iprob);
  if (ng.size<maxlev) {

    // Initial bound = bound of the previous entry (0 for the first entry).
    table_entry_pos_t newend;
    if (found==table[ng.size]) newend=0;
    else newend=bound(found - ndsz,ndt);

    bow(found,ndt,ibow);
    bound(found,ndt,newend);
  }

  cursize[ng.size]++;

  // Progress marker every 5,000,000 stored entries.
  if (!(cursize[ng.size]%5000000))
    cerr << ".";

  return 1;

};
01148
01149
01150 void *lmtable::search(int lev,
01151 table_entry_pos_t offs,
01152 table_entry_pos_t n,
01153 int sz,
01154 int *ngp,
01155 LMT_ACTION action,
01156 char **found)
01157 {
01158
01159
01160
01161
01162
01163
01164
01165
01166 if (lev==1) return *found=(*ngp < (float) n ? table[1] + (table_pos_t)*ngp * sz:NULL);
01167
01168
01169
01170 char* tb;
01171 tb=table[lev] + (table_pos_t) offs * sz;
01172
01173 char w[LMTCODESIZE];
01174 putmem(w,ngp[0],0,LMTCODESIZE);
01175
01176 table_entry_pos_t idx=0;
01177 *found=NULL;
01178
01179 totbsearch[lev]++;
01180 switch(action) {
01181 case LMT_FIND:
01182
01183
01184 if (!tb || !mybsearch(tb,n,sz,w,&idx)) {
01185 return NULL;
01186 } else {
01187
01188 return *found=tb + ((table_pos_t)idx * sz);
01189 }
01190 default:
01191 error((char*)"lmtable::search: this option is available");
01192 };
01193 return NULL;
01194 }
01195
01196
01197
01198
01199 int lmtable::mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx)
01200 {
01201 if (n==0) return 0;
01202
01203 *idx=0;
01204 register table_entry_pos_t low=0, high=n;
01205 register unsigned char *p;
01206 int result;
01207
01208 #ifdef INTERP_SEARCH
01209
01210 char *lp=NULL;
01211 char *hp=NULL;
01212
01213 #endif
01214
01215 for (unsigned int i=0;i<n;i++){
01216
01217 unsigned char* tmp = (unsigned char *) (ar + (i * size));
01218
01219 result=codecmp((char *)key,(char *)tmp);
01220 }
01221 while (low < high) {
01222
01223 #ifdef INTERP_SEARCH
01224
01225
01226 if ((high-low)>=10000) {
01227
01228 lp=(char *) (ar + (low * size));
01229 if (codecmp((char *)key,lp)<0) {
01230 *idx=low;
01231 return 0;
01232 }
01233
01234 hp=(char *) (ar + ((high-1) * size));
01235 if (codecmp((char *)key,hp)>0) {
01236 *idx=high;
01237 return 0;
01238 }
01239
01240 *idx= low + ((high-1)-low) * codediff((char *)key,lp)/codediff(hp,(char *)lp);
01241 } else
01242 #endif
01243 *idx = (low + high) / 2;
01244
01245
01246
01247
01248 p = (unsigned char *) (ar + (*idx * size));
01249 result=codecmp((char *)key,(char *)p);
01250
01251 if (result < 0)
01252 high = *idx;
01253
01254 else if (result > 0)
01255 low = ++(*idx);
01256 else
01257 return 1;
01258 }
01259
01260 *idx=low;
01261
01262 return 0;
01263
01264 }
01265
01266
01267
01268
01269 void lmtable::cpsublm(lmtable* slmt, dictionary* subdict,bool keepunigr)
01270 {
01271
01272
01273
01274
01275 slmt->configure(maxlev,isQtable);
01276 slmt->dict=new dictionary((keepunigr?dict:subdict),false);
01277
01278 if (isQtable) {
01279 for (int i=1; i<=maxlev; i++) {
01280 slmt->NumCenters[i]=NumCenters[i];
01281 slmt->Pcenters[i]=new float [NumCenters[i]];
01282 memcpy(slmt->Pcenters[i],Pcenters[i],NumCenters[i] * sizeof(float));
01283
01284 if (i<maxlev) {
01285 slmt->Bcenters[i]=new float [NumCenters[i]];
01286 memcpy(slmt->Bcenters[i],Bcenters[i],NumCenters[i] * sizeof(float));
01287 }
01288 }
01289 }
01290
01291
01292
01293
01294 dict->genoovcode();
01295 slmt->dict->genoovcode();
01296 subdict->genoovcode();
01297
01298 int* lookup=new int [dict->size()];
01299
01300 for (int c=0; c<dict->size(); c++) {
01301 lookup[c]=subdict->encode(dict->decode(c));
01302 if (c != dict->oovcode() && lookup[c] == subdict->oovcode())
01303 lookup[c]=-1;
01304 }
01305
01306
01307 LMT_TYPE ndt,pndt;
01308 int ndsz,pndsz;
01309 char *entry, *newentry;
01310 table_entry_pos_t start, end, origin;
01311
01312 for (int l=1; l<=maxlev; l++) {
01313
01314 slmt->cursize[l]=0;
01315 slmt->table[l]=NULL;
01316
01317 if (l==1) {
01318
01319 ndt=tbltype[l];
01320 ndsz=nodesize(ndt);
01321
01322 for (table_entry_pos_t p=0; p<cursize[l]; p++) {
01323
01324 entry=table[l] + (table_pos_t) p * ndsz;
01325 if (lookup[word(entry)]!=-1 || keepunigr) {
01326
01327 if ((slmt->cursize[l] % slmt->dict->size()) ==0)
01328 slmt->table[l]=(char *)realloc(slmt->table[l],((table_pos_t) slmt->cursize[l] + (table_pos_t) slmt->dict->size()) * ndsz);
01329
01330 newentry=slmt->table[l] + (table_pos_t) slmt->cursize[l] * ndsz;
01331 memcpy(newentry,entry,ndsz);
01332 if (!keepunigr)
01333 slmt->word(newentry,lookup[word(entry)]);
01334
01335 if (l<maxlev)
01336 slmt->bound(newentry,ndt,p);
01337 slmt->cursize[l]++;
01338 }
01339 }
01340 }
01341
01342 else {
01343
01344 pndt=tbltype[l-1];
01345 pndsz=nodesize(pndt);
01346 ndt=tbltype[l];
01347 ndsz=nodesize(ndt);
01348
01349 for (table_entry_pos_t p=0; p<slmt->cursize[l-1]; p++) {
01350
01351
01352 origin=slmt->bound(slmt->table[l-1] + (table_pos_t)p * pndsz,pndt);
01353 if (origin == 0) start=0;
01354 else start=bound(table[l-1] + (table_pos_t)(origin-1) * pndsz,pndt);
01355 end=bound(table[l-1] + (table_pos_t)origin * pndsz,pndt);
01356
01357 if (!keepunigr || lookup[word(table[l-1] + (table_pos_t)origin * pndsz)]!=-1) {
01358 while (start < end) {
01359
01360 entry=table[l] + (table_pos_t) start * ndsz;
01361
01362 if (lookup[word(entry)]!=-1) {
01363
01364 if ((slmt->cursize[l] % slmt->dict->size()) ==0)
01365 slmt->table[l]=(char *)realloc(slmt->table[l],(table_pos_t) (slmt->cursize[l]+slmt->dict->size()) * ndsz);
01366
01367 newentry=slmt->table[l] + (table_pos_t) slmt->cursize[l] * ndsz;
01368 memcpy(newentry,entry,ndsz);
01369 if (!keepunigr)
01370 slmt->word(newentry,lookup[word(entry)]);
01371
01372 if (l<maxlev)
01373 slmt->bound(newentry,ndt,start);
01374 slmt->cursize[l]++;
01375 }
01376 start++;
01377 }
01378 }
01379
01380
01381 slmt->bound(slmt->table[l-1] + (table_pos_t) p * pndsz, pndt,slmt->cursize[l]);
01382 }
01383 }
01384 }
01385
01386 return;
01387 }
01388
01389
01390
01391
01392
01393 void lmtable::savetxt(const char *filename)
01394 {
01395
01396 fstream out(filename,ios::out);
01397 table_entry_pos_t cnt[1+MAX_NGRAM];
01398 int l;
01399
01400
01401 out.precision(6);
01402
01403 if (isQtable) {
01404 out << "qARPA " << maxlev;
01405 for (l=1; l<=maxlev; l++)
01406 out << " " << NumCenters[l];
01407 out << endl;
01408 }
01409
01410 ngram ng(lmtable::getDict(),0);
01411
01412 cerr << "savetxt: " << filename << "\n";
01413
01414 if (isPruned) ngcnt(cnt);
01415
01416 out << "\n\\data\\\n";
01417 char buff[100];
01418 for (l=1; l<=maxlev; l++) {
01419 sprintf(buff,"ngram %2d=%10d\n",l,(isPruned?cnt[l]:cursize[l]));
01420 out << buff;
01421
01422 }
01423 out << "\n";
01424
01425 for (l=1; l<=maxlev; l++) {
01426
01427 out << "\n\\" << l << "-grams:\n";
01428 cerr << "save: " << (isPruned?cnt[l]:cursize[l]) << " " << l << "-grams\n";
01429 if (isQtable) {
01430 out << NumCenters[l] << "\n";
01431 for (int c=0; c<NumCenters[l]; c++) {
01432 out << Pcenters[l][c];
01433 if (l<maxlev) out << " " << Bcenters[l][c];
01434 out << "\n";
01435 }
01436 }
01437
01438 ng.size=0;
01439 dumplm(out,ng,1,l,0,cursize[1]);
01440
01441 }
01442
01443 out << "\\end\\\n";
01444 cerr << "done\n";
01445 }
01446
01447
01448
01449 void lmtable::savebin(const char *filename)
01450 {
01451 VERBOSE(2,"lmtable::savebin START " << filename << "\n");
01452
01453 if (isPruned) {
01454 VERBOSE(2,"lmtable::savebin: pruned LM cannot be saved in binary form\n");
01455 exit(0);
01456 }
01457
01458
01459 fstream out(filename,ios::out);
01460
01461
01462 if (isQtable) {
01463 out << "Qblmt" << (isInverted?"I":"") << " " << maxlev;
01464 for (int i=1; i<=maxlev; i++) out << " " << cursize[i];
01465 out << "\nNumCenters";
01466 for (int i=1; i<=maxlev; i++) out << " " << NumCenters[i];
01467 out << "\n";
01468
01469 } else {
01470 out << "blmt" << (isInverted?"I":"") << " " << maxlev;
01471 char buff[100];
01472 for (int i=1; i<=maxlev; i++){
01473 sprintf(buff," %10d",cursize[i]);
01474 out << buff;
01475 }
01476 out << "\n";
01477 }
01478
01479 lmtable::getDict()->save(out);
01480
01481 for (int i=1; i<=maxlev; i++) {
01482 if (isQtable) {
01483 out.write((char*)Pcenters[i],NumCenters[i] * sizeof(float));
01484 if (i<maxlev)
01485 out.write((char *)Bcenters[i],NumCenters[i] * sizeof(float));
01486 }
01487 out.write(table[i],(table_pos_t) cursize[i]*nodesize(tbltype[i]));
01488 }
01489
01490 VERBOSE(2,"lmtable::savebin: END\n");
01491 }
01492
01493 void lmtable::savebin_dict(std::fstream& out)
01494 {
01495
01496
01497
01498
01499
01500
01501
01502 cerr << "savebin_dict ...\n";
01503 getDict()->save(out);
01504 }
01505
01506
01507
01508 void lmtable::appendbin_level(int level, fstream &out, int mmap)
01509 {
01510 if (getCurrentSize(level) > 0 ){
01511 if (mmap>0)
01512 appendbin_level_mmap(level, out);
01513 else {
01514 appendbin_level_nommap(level, out);
01515 }
01516 }
01517 }
01518
01519 void lmtable::appendbin_level_nommap(int level, fstream &out)
01520 {
01521 VERBOSE(2,"lmtable:appendbin_level_nommap START Level:" << level << std::endl);
01522
01523
01524
01525
01526
01527
01528
01529
01530 assert(level<=maxlev);
01531
01532
01533 if (isQtable) {
01534
01535 } else {
01536
01537 }
01538
01539 VERBOSE(3,"appending " << cursize[level] << " (maxsize:" << maxsize[level] << ") " << level << "-grams" << " table " << (void*) table << " table[level] " << (void*) table[level] << " out:" << (void*) out << endl);
01540
01541 if (isQtable) {
01542
01543 }
01544
01545 out.write(table[level],(table_pos_t) cursize[level]*nodesize(tbltype[level]));
01546
01547 if (!out.good()) {
01548 perror("Something went wrong while writing");
01549 out.close();
01550 exit(2);
01551 }
01552
01553 VERBOSE(2,"lmtable:appendbin_level_nommap END Level:" << level << std::endl);
01554 }
01555
01556
01557 void lmtable::appendbin_level_mmap(int level, fstream &out)
01558 {
01559 UNUSED(out);
01560 cerr << "appending " << level << " (Actually do nothing)" <<std::endl;
01561 }
01562
01563 void lmtable::savebin_level(int level, const char* outfilename, int mmap)
01564 {
01565 if (mmap>0)
01566 savebin_level_mmap(level, outfilename);
01567 else {
01568 savebin_level_nommap(level, outfilename);
01569 }
01570 }
01571
01572 void lmtable::savebin_level_nommap(int level, const char* outfilename)
01573 {
01574 VERBOSE(2,"lmtable:savebin_level_nommap START" << requiredMaxlev << std::endl);
01575
01576
01577
01578
01579
01580
01581
01582
01583 assert(level<=maxlev);
01584
01585 char nameNgrams[BUFSIZ];
01586 sprintf(nameNgrams,"%s-%dgrams",outfilename,level);
01587
01588
01589
01590 fstream out(nameNgrams, ios::out|ios::binary);
01591
01592 if (out.fail()){
01593
01594 perror("cannot be opened");
01595 exit(3);
01596 }
01597
01598
01599 if (isQtable) {
01600
01601 } else {
01602
01603 }
01604
01605 VERBOSE(3,"saving " << cursize[level] << "(maxsize:" << maxsize[level] << ") " << level << "-grams in " << nameNgrams << " table " << (void*) table << " table[level] " << (void*) table[level] << " out:" << (void*) out << endl);
01606 if (isQtable) {
01607
01608 }
01609
01610 out.write(table[level],(table_pos_t) cursize[level]*nodesize(tbltype[level]));
01611
01612 if (!out.good()) {
01613 std::cerr << " Something went wrong while writing temporary file " << nameNgrams << "\n";
01614 out.close();
01615 removefile(nameNgrams);
01616 exit(2);
01617 }
01618 out.close();
01619 if (out.fail()){
01620
01621 perror("cannot be closed");
01622 exit(3);
01623 }
01624
01625 VERBOSE(2,"lmtable:savebin_level_nommap END" << requiredMaxlev << std::endl);
01626 }
01627
01628 void lmtable::savebin_level_mmap(int level, const char* outfilename)
01629 {
01630 char nameNgrams[BUFSIZ];
01631 sprintf(nameNgrams,"%s-%dgrams",outfilename,level);
01632 VERBOSE(2,"saving " << level << "-grams probs in " << nameNgrams << " (Actually do nothing)" <<std::endl);
01633 }
01634
01635
01636
01637 void lmtable::print_table_stat()
01638 {
01639 cerr << "printing statistics of tables" << std::endl;
01640 for (int i=1; i<=maxlev; i++)
01641 print_table_stat(i);
01642 }
01643
01644 void lmtable::print_table_stat(int level)
01645 {
01646 cerr << " level: " << level
01647 << " maxsize[level]:" << maxsize[level]
01648 << " cursize[level]:" << cursize[level]
01649 << " tb_offset[level]:" << tb_offset[level]
01650 << " table:" << (void*) table
01651 << " table[level]:" << (void*) table[level]
01652 << " tableGaps[level]:" << (void*) tableGaps[level]
01653 << std::endl;
01654 }
01655
01656
01657 void lmtable::concatenate_all_levels(const char* fromfilename, const char* tofilename){
01658
01659
01660 for (int i=1; i<=maxlevel(); i++) {
01661 concatenate_single_level(i, fromfilename, tofilename);
01662 }
01663 }
01664
01665
01666 void lmtable::concatenate_single_level(int level, const char* fromfilename, const char* tofilename){
01667
01668 char fromnameNgrams[BUFSIZ];
01669 char tonameNgrams[BUFSIZ];
01670 sprintf(fromnameNgrams,"%s-%dgrams",fromfilename,level);
01671 sprintf(tonameNgrams,"%s-%dgrams",tofilename,level);
01672
01673 VERBOSE(2,"concatenating " << level << "-grams probs from " << fromnameNgrams << " to " << tonameNgrams<< std::endl);
01674
01675
01676
01677 char cmd[BUFSIZ];
01678 sprintf(cmd,"cat %s >> %s", fromnameNgrams, tonameNgrams);
01679 system(cmd);
01680 }
01681
01682
01683 void lmtable::remove_all_levels(const char* filename){
01684
01685 for (int i=1; i<=maxlevel(); i++) {
01686 remove_single_level(i,filename);
01687 }
01688 }
01689
01690
01691 void lmtable::remove_single_level(int level, const char* filename){
01692
01693 char nameNgrams[BUFSIZ];
01694 sprintf(nameNgrams,"%s-%dgrams",filename,level);
01695
01696
01697 removefile(nameNgrams);
01698 }
01699
01700
01701
01702
01703 void lmtable::delete_level(int level, const char* outfilename, int mmap){
01704 if (mmap>0)
01705 delete_level_mmap(level, outfilename);
01706 else {
01707 delete_level_nommap(level);
01708 }
01709 }
01710
01711 void lmtable::delete_level_mmap(int level, const char* outfilename)
01712 {
01713
01714 char nameNgrams[BUFSIZ];
01715 sprintf(nameNgrams,"%s-%dgrams",outfilename,level);
01716
01717
01718 table_pos_t filesize=(table_pos_t) cursize[level] * nodesize(tbltype[level]);
01719
01720
01721 Munmap(table[level]-tableGaps[level],(table_pos_t) filesize+tableGaps[level],0);
01722
01723 maxsize[level]=cursize[level]=0;
01724 }
01725
01726 void lmtable::delete_level_nommap(int level)
01727 {
01728 delete table[level];
01729 maxsize[level]=cursize[level]=0;
01730 }
01731
01732 void lmtable::compact_all_levels(const char* filename){
01733
01734 for (int i=1; i<=maxlevel(); i++) {
01735 compact_single_level(i,filename);
01736 }
01737 }
01738
01739 void lmtable::compact_single_level(int level, const char* filename)
01740 {
01741 char nameNgrams[BUFSIZ];
01742 sprintf(nameNgrams,"%s-%dgrams",filename,level);
01743
01744 VERBOSE(2,"concatenating " << level << "-grams probs from " << nameNgrams << " to " << filename<< std::endl);
01745
01746
01747
01748 char cmd[BUFSIZ];
01749 sprintf(cmd,"cat %s >> %s", nameNgrams, filename);
01750 system(cmd);
01751
01752
01753 removefile(nameNgrams);
01754 }
01755
01756 void lmtable::resize_level(int level, const char* outfilename, int mmap)
01757 {
01758 if (getCurrentSize(level) > 0 ){
01759 if (mmap>0)
01760 resize_level_mmap(level, outfilename);
01761 else {
01762 if (level<maxlev)
01763 resize_level_nommap(level);
01764 }
01765 }
01766 }
01767
01768 void lmtable::resize_level_mmap(int level, const char* outfilename)
01769 {
01770
01771 char nameNgrams[BUFSIZ];
01772 sprintf(nameNgrams,"%s-%dgrams",outfilename,level);
01773
01774
01775 table_pos_t filesize=(table_pos_t) cursize[level] * nodesize(tbltype[level]);
01776
01777
01778 FILE *fd = NULL;
01779 fd = fopen(nameNgrams, "r+");
01780
01781
01782 Munmap(table[level]-tableGaps[level],(table_pos_t) filesize+tableGaps[level],0);
01783 ftruncate(fileno(fd),filesize);
01784 table[level]=(char *)(MMap(fileno(fd),PROT_READ|PROT_WRITE,0,filesize,&tableGaps[level]));
01785 maxsize[level]=cursize[level];
01786 }
01787
01788 void lmtable::resize_level_nommap(int level)
01789 {
01790 VERBOSE(2,"lmtable::resize_level_nommap START Level " << level << "\n");
01791
01792
01793 table_pos_t filesize=(table_pos_t) cursize[level] * nodesize(tbltype[level]);
01794
01795 char* ptr = new char[filesize];
01796 memcpy(ptr,table[level],filesize);
01797 delete table[level];
01798 table[level]=ptr;
01799 maxsize[level]=cursize[level];
01800
01801 VERBOSE(2,"lmtable::resize_level_nommap END Level " << level << "\n");
01802 }
01803
01804
01805
01806
01807
01808 void lmtable::loadbin_header(istream& inp,const char* header)
01809 {
01810
01811
01812 inp >> maxlev;
01813
01814
01815 isInverted=false;
01816
01817 if (strncmp(header,"Qblmt",5)==0) {
01818 isQtable=true;
01819 if (strncmp(header,"QblmtI",6)==0)
01820 isInverted=true;
01821 } else if(strncmp(header,"blmt",4)==0) {
01822 isQtable=false;
01823 if (strncmp(header,"blmtI",5)==0)
01824 isInverted=true;
01825 } else error((char*)"loadbin: LM file is not in binary format");
01826
01827 configure(maxlev,isQtable);
01828
01829 for (int l=1; l<=maxlev; l++) {
01830 inp >> cursize[l];
01831 maxsize[l]=cursize[l];
01832 }
01833
01834 char header2[MAX_LINE];
01835 if (isQtable) {
01836 inp >> header2;
01837 for (int i=1; i<=maxlev; i++) {
01838 inp >> NumCenters[i];
01839 cerr << "reading " << NumCenters[i] << " centers\n";
01840 }
01841 }
01842 inp.getline(header2, MAX_LINE);
01843 }
01844
01845
01846 void lmtable::loadbin_codebook(istream& inp,int l)
01847 {
01848 Pcenters[l]=new float [NumCenters[l]];
01849 inp.read((char*)Pcenters[l],NumCenters[l] * sizeof(float));
01850 if (l<maxlev) {
01851 Bcenters[l]=new float [NumCenters[l]];
01852 inp.read((char *)Bcenters[l],NumCenters[l]*sizeof(float));
01853 }
01854 }
01855
01856
01857
01858
01859 void lmtable::loadbin(istream& inp, const char* header, const char* filename,int mmap)
01860 {
01861 cerr << "loadbin()\n";
01862 loadbin_header(inp,header);
01863 loadbin_dict(inp);
01864
01865 VERBOSE(3,"lmtable::maxlev" << maxlev << std::endl);
01866 if (maxlev>requiredMaxlev) maxlev=requiredMaxlev;
01867 VERBOSE(3,"lmtable::maxlev:" << maxlev << std::endl);
01868 VERBOSE(3,"lmtable::requiredMaxlev" << requiredMaxlev << std::endl);
01869
01870
01871 if (filename && mmap>0) {
01872
01873 #ifdef WIN32
01874 error("lmtable::loadbin mmap facility not yet supported under WIN32\n");
01875 #else
01876
01877 if (mmap <= maxlev) memmap=mmap;
01878 else error((char*)"keep_on_disk value is out of range\n");
01879
01880 if ((diskid=open(filename, O_RDONLY))<0) {
01881 std::cerr << "cannot open " << filename << "\n";
01882 error((char*)"dying");
01883 }
01884
01885
01886 char miniheader[4];
01887 read(diskid,miniheader,4);
01888 if (strncmp(miniheader,"Qblm",4) && strncmp(miniheader,"blmt",4))
01889 error((char*)"mmap functionality does not work with compressed binary LMs\n");
01890 #endif
01891 }
01892
01893 for (int l=1; l<=maxlev; l++) {
01894 loadbin_level(inp,l);
01895 }
01896 cerr << "done\n";
01897 }
01898
01899
01900
01901 void lmtable::loadbin_dict(istream& inp)
01902 {
01903 cerr << "lmtable::loadbin_dict()\n";
01904 lmtable::getDict()->load(inp);
01905 cerr << "dict->size(): " << lmtable::getDict()->size() << "\n";
01906 }
01907
01908
01909 void lmtable::loadbin_level(istream& inp, int level)
01910 {
01911 cerr << "loadbin_level (level " << level << ")\n";
01912
01913 if (isQtable) loadbin_codebook(inp,level);
01914 if ((memmap == 0) || (level < memmap)) {
01915 cerr << "loading " << cursize[level] << " " << level << "-grams\n";
01916 table[level]=new char[(table_pos_t) cursize[level] * nodesize(tbltype[level])];
01917 inp.read(table[level],(table_pos_t) cursize[level] * nodesize(tbltype[level]));
01918 } else {
01919
01920 #ifdef WIN32
01921 error((char*)"mmap not available under WIN32\n");
01922 #else
01923 cerr << "mapping " << cursize[level] << " " << level << "-grams\n";
01924 tableOffs[level]=inp.tellg();
01925 table[level]=(char *)MMap(diskid,PROT_READ,
01926 tableOffs[level], (table_pos_t) cursize[level]*nodesize(tbltype[level]),
01927 &tableGaps[level]);
01928 table[level]+=(table_pos_t) tableGaps[level];
01929 cerr << "tableOffs " << tableOffs[level] << " tableGaps" << tableGaps[level] << "-grams\n";
01930 inp.seekg((table_pos_t) cursize[level]*nodesize(tbltype[level]),ios_base::cur);
01931 #endif
01932 }
01933 cerr << "done (level " << level <<")\n";
01934 }
01935
01936 int lmtable::get(ngram& ng,int n,int lev)
01937 {
01938 totget[lev]++;
01939
01940 if (lev > maxlev) error((char*)"get: lev exceeds maxlevel");
01941 if (n < lev) error((char*)"get: ngram is too small");
01942
01943
01944 table_entry_pos_t offset=0,limit=cursize[1];
01945
01946
01947 table_entry_pos_t hit;
01948 char* found;
01949 LMT_TYPE ndt;
01950 ng.link=NULL;
01951 ng.lev=0;
01952
01953 for (int l=1; l<=lev; l++) {
01954
01955
01956 hit = 0 ;
01957 found = NULL;
01958 ndt=tbltype[l];
01959
01960 #ifdef LMT_CACHE_ENABLE
01961 if (lmtcache[l] && lmtcache[l]->get(ng.wordp(n),found)) {
01962 hit=1;
01963 } else {
01964 search(l,
01965 offset,
01966 (limit-offset),
01967 nodesize(ndt),
01968 ng.wordp(n-l+1),
01969 LMT_FIND,
01970 &found);
01971 }
01972
01973
01974
01975
01976
01977 if (lmtcache[l] && hit==0) {
01978
01979 const char* found2=found;
01980 lmtcache[l]->add(ng.wordp(n),found2);
01981 }
01982 #else
01983 search(l,
01984 offset,
01985 (limit-offset),
01986 nodesize(ndt),
01987 ng.wordp(n-l+1),
01988 LMT_FIND,
01989 &found);
01990 #endif
01991
01992 if (!found) return 0;
01993
01994 float pr = prob(found,ndt);
01995 if (pr==NOPROB) return 0;
01996
01997 ng.path[l]=found;
01998 ng.bow=(l<maxlev?bow(found,ndt):0);
01999 ng.prob=pr;
02000 ng.link=found;
02001 ng.info=ndt;
02002 ng.lev=l;
02003
02004 if (l<maxlev) {
02005
02006
02007 if (offset+1==cursize[l]) limit=cursize[l+1];
02008 else limit=bound(found,ndt);
02009
02010
02011 if (found==table[l]) offset=0;
02012 else offset=bound((found - nodesize(ndt)),ndt);
02013
02014 assert(offset!=BOUND_EMPTY1);
02015 assert(limit!=BOUND_EMPTY1);
02016 }
02017 }
02018
02019
02020
02021 ng.size=n;
02022 ng.freq=0;
02023 ng.succ=(lev<maxlev?limit-offset:0);
02024
02025 #ifdef TRACE_CACHELM
02026 if (ng.size==maxlev && sentence_id>0) {
02027 *cacheout << sentence_id << " miss " << ng << " " << ng.link << "\n";
02028 }
02029 #endif
02030 return 1;
02031 }
02032
02033
02034
02035
02036 void lmtable::dumplm(fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos)
02037 {
02038
02039 LMT_TYPE ndt=tbltype[ilev];
02040 ngram ing(ng.dict);
02041 int ndsz=nodesize(ndt);
02042
02043 assert(ng.size==ilev-1);
02044
02045
02046 assert(epos<=cursize[ilev]);
02047 assert(ipos<epos);
02048 ng.pushc(0);
02049
02050 for (table_entry_pos_t i=ipos; i<epos; i++) {
02051 char* found=table[ilev]+ (table_pos_t) i * ndsz;
02052 *ng.wordp(1)=word(found);
02053
02054 float ipr=prob(found,ndt);
02055
02056
02057 if(isPruned && ipr==NOPROB) continue;
02058
02059 if (ilev<elev) {
02060
02061 table_entry_pos_t isucc=(i>0?bound(table[ilev]+ (table_pos_t) (i-1) * ndsz,ndt):0);
02062 table_entry_pos_t esucc=bound(found,ndt);
02063
02064
02065 if (isucc < esucc)
02066 dumplm(out,ng,ilev+1,elev,isucc,esucc);
02067 } else {
02068 out << ipr <<"\t";
02069
02070
02071 if (isInverted & ng.size>1) {
02072 ing.invert(ng);
02073 for (int k=ing.size; k>=1; k--) {
02074 if (k<ing.size) out << " ";
02075 out << lmtable::getDict()->decode(*ing.wordp(k));
02076 }
02077 } else {
02078 for (int k=ng.size; k>=1; k--) {
02079 if (k<ng.size) out << " ";
02080 out << lmtable::getDict()->decode(*ng.wordp(k));
02081 }
02082 }
02083
02084 if (ilev<maxlev) {
02085 float ibo=bow(table[ilev]+ (table_pos_t)i * ndsz,ndt);
02086 if (isQtable){
02087 out << "\t" << ibo;
02088 }
02089 else{
02090 if ((ibo>UPPER_SINGLE_PRECISION_OF_0 || ibo<-UPPER_SINGLE_PRECISION_OF_0)) out << "\t" << ibo;
02091 }
02092 }
02093 out << "\n";
02094 }
02095 }
02096 }
02097
02098
02099
02100
02101 int lmtable::succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev)
02102 {
02103 assert(lev==h.lev+1 && h.size==lev && lev<=maxlev);
02104
02105 LMT_TYPE ndt=tbltype[h.lev];
02106 int ndsz=nodesize(ndt);
02107
02108 table_entry_pos_t offset;
02109 switch (action) {
02110
02111 case LMT_INIT:
02112
02113
02114 ng.size=lev;
02115 ng.trans(h);
02116
02117 ng.midx[lev]=0;
02118 offset=(h.link>table[h.lev]?bound(h.link-ndsz,ndt):0);
02119 h.succ=bound(h.link,ndt)-offset;
02120 h.succlink=table[lev]+(table_pos_t) offset * nodesize(tbltype[lev]);
02121 return 1;
02122
02123 case LMT_CONT:
02124 if (ng.midx[lev] < h.succ) {
02125
02126 *ng.wordp(1)=word(h.succlink+(table_pos_t) ng.midx[lev]*nodesize(tbltype[lev]));
02127 ng.midx[lev]++;
02128 return 1;
02129 } else
02130 return 0;
02131
02132 default:
02133 cerr << "succscan: only permitted options are LMT_INIT and LMT_CONT\n";
02134 exit(0);
02135 }
02136 }
02137
02138
02139
02140
02141
02142
02143
02144 const char *lmtable::maxsuffptr(ngram ong, unsigned int* size)
02145 {
02146
02147
02148
02149 if (ong.size==0) {
02150 if (size!=NULL) *size=0;
02151 return (char*) NULL;
02152 }
02153
02154 if (isInverted) {
02155 if (ong.size>maxlev) ong.size=maxlev;
02156 ngram ing=ong;
02157
02158 ing.invert(ong);
02159
02160
02161 get(ing,ing.size,ing.size);
02162 if (ing.lev > 0) {
02163 unsigned int isize = MIN(ing.lev,(ing.size-1));
02164 if (size!=NULL) *size=isize;
02165 return ing.path[isize];
02166 } else {
02167 if (size!=NULL) *size=0;
02168 return NULL;
02169 }
02170 } else {
02171 if (ong.size>0) ong.size--;
02172
02173 if (ong.size>=maxlev) ong.size=maxlev-1;
02174
02175 if (size!=NULL) *size=ong.size;
02176
02177 for (ngram ng=ong; ng.size>0; ng.size--) {
02178 if (get(ng,ng.size,ng.size)) {
02179 if (ng.succ==0) (*size)--;
02180 if (size!=NULL) *size=ng.size;
02181 return ng.link;
02182 }
02183 }
02184 if (size!=NULL) *size=0;
02185 return NULL;
02186 }
02187 }
02188
02189
02190 const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size)
02191 {
02192
02193
02194
02195
02196 if (size!=NULL) *size=ong.size;
02197 if (ong.size==0) return (char*) NULL;
02198
02199 char* found;
02200 unsigned int isize;
02201
02202 #ifdef PS_CACHE_ENABLE
02203 prob_and_state_t pst;
02204
02205 size_t orisize=ong.size;
02206 if (ong.size>=maxlev) ong.size=maxlev-1;
02207
02208
02209 if (prob_and_state_cache && (ong.size==maxlev-1) && prob_and_state_cache->get(ong.wordp(maxlev-1),pst)) {
02210 *size=pst.statesize;
02211 return pst.state;
02212 }
02213 ong.size = orisize;
02214 #endif
02215
02216
02217 found=(char *)maxsuffptr(ong,&isize);
02218
02219 #ifdef PS_CACHE_ENABLE
02220
02221 if (ong.size>=maxlev) ong.size=maxlev-1;
02222 if (prob_and_state_cache && ong.size==maxlev-1) {
02223 pst.state=found;
02224 pst.statesize=isize;
02225 prob_and_state_cache->add(ong.wordp(maxlev-1),pst);
02226 }
02227 #endif
02228
02229 if (size!=NULL) *size=isize;
02230
02231 return found;
02232 }
02233
02234
02235
02236
02237
02238
02239
02240
02241
02242
02243
02244
02245
02246
02247 double lmtable::lprob(ngram ong,double* bow, int* bol, char** maxsuffptr,unsigned int* statesize,
02248 bool* extendible, double *lastbow)
02249 {
02250 VERBOSE(3," lmtable::lprob(ngram) ong " << ong << "\n");
02251
02252 if (ong.size==0) return 0.0;
02253 if (ong.size>maxlev) ong.size=maxlev;
02254
02255 if (bow) *bow=0;
02256 if (bol) *bol=0;
02257
02258
02259 double rbow=0,lpr=0;
02260 float ibow,iprob;
02261
02262
02263 if (isInverted) {
02264 ngram ing=ong;
02265
02266 ing.invert(ong);
02267 get(ing,ing.size,ing.size);
02268 if (ing.lev >0) {
02269 iprob=ing.prob;
02270 lpr = (double)(isQtable?Pcenters[ing.lev][(qfloat_t)iprob]:iprob);
02271 if (*ong.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty;
02272 if (statesize) *statesize=MIN(ing.lev,(ing.size-1));
02273 if (maxsuffptr) *maxsuffptr=ing.path[MIN(ing.lev,(ing.size-1))];
02274 if (extendible) *extendible=succrange(ing.path[ing.lev],ing.lev)>0;
02275 if (lastbow) *lastbow=(double) (isQtable?Bcenters[ing.lev][(qfloat_t)ing.bow]:ing.bow);
02276 } else {
02277 lpr=-log(UNIGRAM_RESOLUTION)/M_LN10;
02278 if (statesize) *statesize=0;
02279 if (maxsuffptr) *maxsuffptr=NULL;
02280 }
02281
02282 if (ing.lev < ing.size) {
02283 int depth=(ing.lev>0?ing.lev:1);
02284 if (bol) *bol=ing.size-depth;
02285 ing.size--;
02286 get(ing,ing.size,ing.size);
02287 if (ing.lev>0) {
02288
02289 for (int l=depth; l<=ing.lev; l++) {
02290
02291 assert(ing.path[l]!=NULL);
02292 ibow=this->bow(ing.path[l],tbltype[l]);
02293 rbow+= (double) (isQtable?Bcenters[l][(qfloat_t)ibow]:ibow);
02294
02295
02296 if (isQtable && (*ing.wordp(ing.size)==dict->oovcode())) {
02297 rbow-=(double)Bcenters[l][(qfloat_t)ibow];
02298 }
02299 }
02300 }
02301 }
02302
02303 if (bow) (*bow)=rbow;
02304 return rbow + lpr;
02305 }
02306 else {
02307 assert(extendible==NULL && lastbow==NULL);
02308 for (ngram ng=ong; ng.size>0; ng.size--) {
02309 if (get(ng,ng.size,ng.size)) {
02310 iprob=ng.prob;
02311 lpr = (double)(isQtable?Pcenters[ng.size][(qfloat_t)iprob]:iprob);
02312 if (*ng.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty;
02313 if (maxsuffptr || statesize) {
02314 if (ong.size==ng.size) {
02315 ng.size--;
02316 get(ng,ng.size,ng.size);
02317 }
02318 if (statesize) *statesize=ng.size;
02319 if (maxsuffptr) *maxsuffptr=ng.link;
02320 }
02321 return rbow+lpr;
02322 } else {
02323 if (ng.size==1) {
02324 if (maxsuffptr) *maxsuffptr=NULL;
02325 if (statesize) *statesize=0;
02326 return rbow -log(UNIGRAM_RESOLUTION)/M_LN10;
02327 } else {
02328 if (bol) (*bol)++;
02329 if (ng.lev==(ng.size-1)) {
02330 ibow=ng.bow;
02331 rbow+= (double) (isQtable?Bcenters[ng.lev][(qfloat_t)ibow]:ibow);
02332
02333 if (isQtable && (*ng.wordp(2)==dict->oovcode())) {
02334 rbow-=(double)Bcenters[ng.lev][(qfloat_t)ibow];
02335 }
02336 }
02337 if (bow) (*bow)=rbow;
02338 }
02339
02340 }
02341
02342 }
02343 }
02344 assert(0);
02345 return 1.0;
02346 }
02347
02348
02349
02350 double lmtable::clprob(ngram ong,double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible)
02351 {
02352 VERBOSE(3," lmtable::clprob(ngram), parameter = <" << ong << ">\n");
02353
02354 #ifdef TRACE_CACHELM
02355 if (probcache && ong.size==maxlev && sentence_id>0) {
02356 *cacheout << sentence_id << " " << ong << "\n";
02357 }
02358 #endif
02359
02360 if (ong.size==0) {
02361 if (statesize!=NULL) *statesize=0;
02362 if (state!=NULL) *state=NULL;
02363 if (extendible!=NULL) *extendible=false;
02364 return 0.0;
02365 }
02366
02367 if (ong.size>maxlev) ong.size=maxlev;
02368
02369 #ifdef PS_CACHE_ENABLE
02370 double logpr = 0.0;
02371
02372 prob_and_state_t pst_get;
02373
02374 if (prob_and_state_cache && ong.size==maxlev && prob_and_state_cache->get(ong.wordp(maxlev),pst_get)) {
02375 logpr=pst_get.logpr;
02376 if (bow) *bow = pst_get.bow;
02377 if (bol) *bol = pst_get.bol;
02378 if (state) *state = pst_get.state;
02379 if (statesize) *statesize = pst_get.statesize;
02380 if (extendible) *extendible = pst_get.extendible;
02381
02382 return logpr;
02383 }
02384
02385
02386
02387 prob_and_state_t pst_add;
02388 logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible));
02389
02390
02391 if (bow) *bow = pst_add.bow;
02392 if (bol) *bol = pst_add.bol;
02393 if (state) *state = pst_add.state;
02394 if (statesize) *statesize = pst_add.statesize;
02395 if (extendible) *extendible = pst_add.extendible;
02396
02397
02398 if (prob_and_state_cache && ong.size==maxlev) {
02399 prob_and_state_cache->add(ong.wordp(maxlev),pst_add);
02400 }
02401 return logpr;
02402 #else
02403 return lmtable::lprob(ong, bow, bol, state, statesize, extendible);
02404 #endif
02405 };
02406
02407
02408
02409
02410 double lmtable::clprob(int* codes, int sz, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible)
02411 {
02412 VERBOSE(3," lmmacro::clprob(int*, int,...)\n");
02413 #ifdef TRACE_CACHELM
02414 if (probcache && sz==maxlev && sentence_id>0) {
02415 *cacheout << sentence_id << "\n";
02416
02417 }
02418 #endif
02419
02420 if (sz==0) {
02421 if (statesize!=NULL) *statesize=0;
02422 if (state!=NULL) *state=NULL;
02423 if (extendible!=NULL) *extendible=false;
02424 return 0.0;
02425 }
02426
02427 if (sz>maxlev) sz=maxlev;
02428
02429 double logpr = 0.0;
02430
02431 #ifdef PS_CACHE_ENABLE
02432
02433 prob_and_state_t pst_get;
02434
02435 if (prob_and_state_cache && sz==maxlev && prob_and_state_cache->get(codes,pst_get)) {
02436 logpr=pst_get.logpr;
02437 if (bow) *bow = pst_get.bow;
02438 if (bol) *bol = pst_get.bol;
02439 if (state) *state = pst_get.state;
02440 if (statesize) *statesize = pst_get.statesize;
02441 if (extendible) *extendible = pst_get.extendible;
02442
02443 return logpr;
02444 }
02445
02446
02447
02448 ngram ong(dict);
02449 ong.pushc(codes,sz);
02450 assert (ong.size == sz);
02451
02452
02453 prob_and_state_t pst_add;
02454 logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible));
02455
02456
02457 if (bow) *bow = pst_add.bow;
02458 if (bol) *bol = pst_add.bol;
02459 if (state) *state = pst_add.state;
02460 if (statesize) *statesize = pst_add.statesize;
02461 if (extendible) *extendible = pst_add.extendible;
02462
02463
02464 if (prob_and_state_cache && ong.size==maxlev) {
02465 prob_and_state_cache->add(ong.wordp(maxlev),pst_add);
02466 }
02467 return logpr;
02468 #else
02469
02470
02471 ngram ong(dict);
02472 ong.pushc(codes,sz);
02473 assert (ong.size == sz);
02474
02475 logpr = lmtable::lprob(ong, bow, bol, state, statesize, extendible);
02476 return logpr;
02477 #endif
02478 };
02479
02480
02481 int lmtable::succrange(node ndp,int level,table_entry_pos_t* isucc,table_entry_pos_t* esucc)
02482 {
02483 table_entry_pos_t first,last;
02484 LMT_TYPE ndt=tbltype[level];
02485
02486
02487 if (level<maxlev) {
02488 first = ndp>table[level]? bound(ndp-nodesize(ndt), ndt) : 0;
02489 last = bound(ndp, ndt);
02490 } else {
02491 first=last=0;
02492 }
02493 if (isucc) *isucc=first;
02494 if (esucc) *esucc=last;
02495
02496 return last-first;
02497 }
02498
02499
02500 void lmtable::stat(int level)
02501 {
02502 table_pos_t totmem=0,memory;
02503 float mega=1024 * 1024;
02504
02505 cout.precision(2);
02506
02507 cout << "lmtable class statistics\n";
02508
02509 cout << "levels " << maxlev << "\n";
02510 for (int l=1; l<=maxlev; l++) {
02511 memory=(table_pos_t) cursize[l] * nodesize(tbltype[l]);
02512 cout << "lev " << l
02513 << " entries "<< cursize[l]
02514 << " used mem " << memory/mega << "Mb\n";
02515 totmem+=memory;
02516 }
02517
02518 cout << "total allocated mem " << totmem/mega << "Mb\n";
02519
02520 cout << "total number of get and binary search calls\n";
02521 for (int l=1; l<=maxlev; l++) {
02522 cout << "level " << l << " get: " << totget[l] << " bsearch: " << totbsearch[l] << "\n";
02523 }
02524
02525 if (level >1 ) lmtable::getDict()->stat();
02526
02527 }
02528
02529 void lmtable::reset_mmap()
02530 {
02531 #ifndef WIN32
02532 if (memmap>0 and memmap<=maxlev)
02533 for (int l=memmap; l<=maxlev; l++) {
02534
02535 Munmap(table[l]-tableGaps[l],(table_pos_t) cursize[l]*nodesize(tbltype[l])+tableGaps[l],0);
02536 table[l]=(char *)MMap(diskid,PROT_READ,
02537 tableOffs[l], (table_pos_t)cursize[l]*nodesize(tbltype[l]),
02538 &tableGaps[l]);
02539 table[l]+=(table_pos_t)tableGaps[l];
02540 }
02541 #endif
02542 }
02543
02544
02545
02546
02547
02548
02549
02550 double lmtable::lprobx(ngram ong,
02551 double *lkp,
02552 double *bop,
02553 int *bol)
02554 {
02555 double bo, lbo, pr;
02556 float ipr;
02557
02558 ngram ng(dict), ctx(dict);
02559
02560 if(bol) *bol=0;
02561 if(ong.size==0) {
02562 if(lkp) *lkp=0;
02563 return 0;
02564 }
02565 if(ong.size>maxlev) ong.size=maxlev;
02566 ctx = ng = ong;
02567 bo=0;
02568 ctx.shift();
02569 while(!get(ng)) {
02570
02571
02572 if(ng.size==1) {
02573 pr = -log(UNIGRAM_RESOLUTION)/M_LN10;
02574 if(lkp) *lkp=pr;
02575 pr += bo;
02576 return pr;
02577 }
02578
02579 lbo = 0.0;
02580 if(get(ctx)) {
02581 ipr = ctx.bow;
02582 lbo = isQtable?Bcenters[ng.size][(qfloat_t)ipr]:ipr;
02583
02584 }
02585 if(bop) *bop++=lbo;
02586 if(bol) ++*bol;
02587 bo += lbo;
02588 ng.size--;
02589 ctx.size--;
02590 }
02591 ipr = ng.prob;
02592 pr = isQtable?Pcenters[ng.size][(qfloat_t)ipr]:ipr;
02593
02594 if(lkp) *lkp=pr;
02595 pr += bo;
02596 return pr;
02597 }
02598
02599
02600
02601 table_entry_pos_t lmtable::wdprune(float *thr, int aflag)
02602 {
02603
02604
02605 int l;
02606 ngram ng(lmtable::getDict(),0);
02607
02608 isPruned=true;
02609
02610 ng.size=0;
02611
02612 double tlk, bo, ts, tbs;
02613 tlk = bo = ts = tbs = 0;
02614
02615 for(l=2; l<=maxlev; l++) wdprune(thr, aflag, ng, 1, l, 0, cursize[1]);
02616 return 0;
02617 }
02618
02619
02620
02621 table_entry_pos_t lmtable::wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double tlk, double bo, double *ts, double *tbs)
02622 {
02623 LMT_TYPE ndt=tbltype[ilev];
02624 int ndsz=nodesize(ndt);
02625 char *ndp;
02626 float lk;
02627 float ipr, ibo;
02628
02629 table_entry_pos_t i, k, nk;
02630
02631 assert(ng.size==ilev-1);
02632
02633 assert(epos<=cursize[ilev] && ipos<epos);
02634
02635 ng.pushc(0);
02636
02637 for(i=ipos, nk=0; i<epos; i++) {
02638
02639
02640 ndp = table[ilev]+(table_pos_t)i*ndsz;
02641 *ng.wordp(1) = word(ndp);
02642
02643
02644 ipr = prob(ndp, ndt);
02645 if(ipr==NOPROB) continue;
02646
02647 if ((ilev == 1) && (*ng.wordp(ng.size) == getDict()->getcode(BOS_))) {
02648
02649
02650 ipr = 0.0;
02651 }
02652 lk = ipr;
02653
02654 if(ilev<elev) {
02655
02656
02657 ibo = bow(ndp, ndt);
02658 bo = ibo;
02659
02660
02661 table_entry_pos_t isucc,esucc;
02662 succrange(ndp,ilev,&isucc,&esucc);
02663
02664
02665
02666 if(isucc>=esucc) continue;
02667
02668
02669
02670 prune:
02671 double nextlevel_ts=0, nextlevel_tbs=0;
02672 k = wdprune(thr, aflag, ng, ilev+1, elev, isucc, esucc, tlk+lk, bo, &nextlevel_ts, &nextlevel_tbs);
02673
02674 if(ilev!=elev-1) continue;
02675 if(nextlevel_ts>=1 || nextlevel_tbs>=1) {
02676 cerr << "ng: " << ng
02677 <<" nextlevel_ts=" << nextlevel_ts
02678 <<" nextlevel_tbs=" << nextlevel_tbs
02679 <<" k=" << k
02680 <<" ns=" << esucc-isucc
02681 << "\n";
02682 if(nextlevel_ts>=1) {
02683 pscale(ilev+1, isucc, esucc, 0.999999/nextlevel_ts);
02684 goto prune;
02685 }
02686 }
02687
02688
02689 bo = log((1-nextlevel_ts)/(1-nextlevel_tbs))/M_LN10;
02690 ibo=(float)bo;
02691 bow(ndp, ndt, ibo);
02692 } else {
02693
02694
02695 ngram bng = ng;
02696 bng.size--;
02697 double blk = lprob(bng);
02698
02699 double wd = pow(10., tlk+lk) * (lk-bo-blk);
02700 if(aflag&&wd<0) wd=-wd;
02701 if(wd > thr[elev-1]) {
02702 *ts += pow(10., lk);
02703 *tbs += pow(10., blk);
02704 } else {
02705 ++nk;
02706 prob(ndp, ndt, NOPROB);
02707 }
02708 }
02709 }
02710 return nk;
02711 }
02712
02713 int lmtable::pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s)
02714 {
02715 LMT_TYPE ndt=tbltype[lev];
02716 int ndsz=nodesize(ndt);
02717 char *ndp;
02718 float ipr;
02719
02720 s=log(s)/M_LN10;
02721 ndp = table[lev]+ (table_pos_t) ipos*ndsz;
02722 for(table_entry_pos_t i=ipos; i<epos; ndp+=ndsz,i++) {
02723 ipr = prob(ndp, ndt);
02724 if(ipr==NOPROB) continue;
02725 ipr+=(float) s;
02726 prob(ndp, ndt, ipr);
02727 }
02728 return 0;
02729 }
02730
02731
02732 table_entry_pos_t lmtable::ngcnt(table_entry_pos_t *cnt)
02733 {
02734 ngram ng(lmtable::getDict(),0);
02735 memset(cnt, 0, (maxlev+1)*sizeof(*cnt));
02736 ngcnt(cnt, ng, 1, 0, cursize[1]);
02737 return 0;
02738 }
02739
02740
02741 table_entry_pos_t lmtable::ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos)
02742 {
02743
02744 table_entry_pos_t i, isucc, esucc;
02745 float ipr;
02746 char *ndp;
02747 LMT_TYPE ndt=tbltype[l];
02748 int ndsz=nodesize(ndt);
02749
02750 ng.pushc(0);
02751 for(i=ipos; i<epos; i++) {
02752 ndp = table[l]+(table_pos_t) i*ndsz;
02753 *ng.wordp(1)=word(ndp);
02754 ipr=prob(ndp, ndt);
02755 if(ipr==NOPROB) continue;
02756 ++cnt[l];
02757 if(l==maxlev) continue;
02758 succrange(ndp,l,&isucc,&esucc);
02759 if(isucc < esucc) ngcnt(cnt, ng, l+1, isucc, esucc);
02760 }
02761 return 0;
02762 }
02763
02764
02765
02766