00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 using namespace std;
00022
00023 #include <iostream>
00024 #include <cmath>
00025 #include <math.h>
00026 #include "cmd.h"
00027 #include "mfstream.h"
00028 #include "mempool.h"
00029 #include "htable.h"
00030 #include "dictionary.h"
00031 #include "n_gram.h"
00032 #include "mempool.h"
00033 #include "ngramtable.h"
00034 #include "interplm.h"
00035 #include "normcache.h"
00036 #include "ngramcache.h"
00037 #include "mdiadapt.h"
00038 #include "shiftlm.h"
00039 #include "linearlm.h"
00040 #include "mixture.h"
00041 #include "lmtable.h"
00042
00043
00044 #define NGRAM 1
00045 #define SEQUENCE 2
00046 #define ADAPT 3
00047 #define TURN 4
00048 #define TEXT 5
00049
// Accepted values for the -LanguageModelType/-lm command-line option,
// mapped to the smoothing-method codes declared in the LM headers.
// Several spellings (long name, interpolated alias, short alias) map to
// the same code; backoff vs. interpolated is selected separately via -bo.
static Enum_T LmTypeEnum [] = {
  { (char*)"ModifiedShiftBeta", MOD_SHIFT_BETA },
  { (char*)"msb", MOD_SHIFT_BETA },
  { (char*)"InterpShiftBeta", SHIFT_BETA },
  { (char*)"ShiftBeta", SHIFT_BETA },
  { (char*)"sb", SHIFT_BETA },
  { (char*)"InterpShiftOne", SHIFT_ONE },
  { (char*)"ShiftOne", SHIFT_ONE },
  { (char*)"s1", SHIFT_ONE },
  { (char*)"LinearWittenBell", LINEAR_WB },
  { (char*)"wb", LINEAR_WB },
  { (char*)"LinearGoodTuring", LINEAR_GT },
  { (char*)"Mixture", MIXTURE },
  { (char*)"mix", MIXTURE },
  END_ENUM
};
00066
00067
// Accepted values for the -Interactive/-i command-line option, mapped to
// the interaction-mode codes (#define NGRAM/SEQUENCE/ADAPT/TURN/TEXT above).
// "Yes" is kept as a legacy alias for the n-gram query mode.
static Enum_T InteractiveModeEnum [] = {
  { (char*)"Ngram", NGRAM },
  { (char*)"Sequence", SEQUENCE },
  { (char*)"Adapt", ADAPT },
  { (char*)"Turn", TURN },
  { (char*)"Text", TEXT },
  { (char*)"Yes", NGRAM },
  END_ENUM
};
00077
00078 void print_help(int TypeFlag=0){
00079 std::cerr << std::endl << "tlm - estimates a language model" << std::endl;
00080 std::cerr << std::endl << "USAGE:" << std::endl;
00081 std::cerr << " not yet available" << std::endl;
00082 std::cerr << std::endl << "DESCRIPTION:" << std::endl;
00083 std::cerr << " tlm is a tool for the estimation of language model" << std::endl;
00084 std::cerr << std::endl << "OPTIONS:" << std::endl;
00085 std::cerr << " -Help|-h this help" << std::endl;
00086 std::cerr << std::endl;
00087
00088 FullPrintParams(TypeFlag, 0, 1, stderr);
00089 }
00090
00091 void usage(const char *msg = 0)
00092 {
00093 if (msg){
00094 std::cerr << msg << std::endl;
00095 }
00096 else{
00097 print_help();
00098 }
00099 exit(1);
00100 }
00101
00102 int main(int argc, char **argv)
00103 {
00104
00105 char *dictfile=NULL;
00106 char *trainfile=NULL;
00107 char *testfile=NULL;
00108 char *adaptfile=NULL;
00109 char *slminfo=NULL;
00110
00111 char *imixpar=NULL;
00112 char *omixpar=NULL;
00113
00114 char *BINfile=NULL;
00115 char *ARPAfile=NULL;
00116 bool SavePerLevel=true;
00117
00118 char *ASRfile=NULL;
00119
00120 char* scalefactorfile=NULL;
00121
00122 int backoff=0;
00123 int lmtype=0;
00124 int dub=0;
00125 int size=0;
00126
00127 int interactive=0;
00128 int statistics=0;
00129
00130 int prunefreq=0;
00131 bool prunesingletons=true;
00132 bool prunetopsingletons=false;
00133
00134 double beta=-1;
00135
00136 bool compsize=false;
00137 bool checkpr=false;
00138 double oovrate=0;
00139 int max_caching_level=0;
00140
00141 char *outpr=NULL;
00142
00143 bool memmap = false;
00144
00145 int adaptlevel=0;
00146 double adaptrate=1.0;
00147 bool adaptoov=false;
00148
00149 bool help=false;
00150
00151 DeclareParams((char*)
00152 "Back-off",CMDBOOLTYPE|CMDMSG, &backoff, "boolean flag for backoff LM (default is false, i.e. interpolated LM)",
00153 "bo",CMDBOOLTYPE|CMDMSG, &backoff, "boolean falg for backoff LM (default is false, i.e. interpolated LM)",
00154 "Dictionary", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary to filter the LM (default is NULL)",
00155 "d", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary to filter the LM (default is NULL)",
00156
00157 "DictionaryUpperBound", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7",
00158 "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7",
00159
00160 "NgramSize", CMDSUBRANGETYPE|CMDMSG, &size, 1, MAX_NGRAM, "order of the LM",
00161 "n", CMDSUBRANGETYPE|CMDMSG, &size, 1, MAX_NGRAM, "order of the LM",
00162
00163 "Ngram", CMDSTRINGTYPE|CMDMSG, &trainfile, "training file",
00164 "TrainOn", CMDSTRINGTYPE|CMDMSG, &trainfile, "training file",
00165 "tr", CMDSTRINGTYPE|CMDMSG, &trainfile, "training file",
00166
00167 "oASR", CMDSTRINGTYPE|CMDMSG, &ASRfile, "output file in ASR format",
00168 "oasr", CMDSTRINGTYPE|CMDMSG, &ASRfile, "output file in ASR format",
00169
00170 "o", CMDSTRINGTYPE|CMDMSG, &ARPAfile, "output file in ARPA format",
00171 "oARPA", CMDSTRINGTYPE|CMDMSG, &ARPAfile, "output file in ARPA format",
00172 "oarpa", CMDSTRINGTYPE|CMDMSG, &ARPAfile, "output file in ARPA format",
00173
00174 "oBIN", CMDSTRINGTYPE|CMDMSG, &BINfile, "output file in binary format",
00175 "obin", CMDSTRINGTYPE|CMDMSG, &BINfile, "output file in binary format",
00176
00177 "SavePerLevel",CMDBOOLTYPE|CMDMSG, &SavePerLevel, "saving type of the LM (true: per level (default), false: per word)",
00178 "spl",CMDBOOLTYPE|CMDMSG, &SavePerLevel, "saving type of the LM (true: per level (default), false: per word)",
00179
00180 "TestOn", CMDSTRINGTYPE|CMDMSG, &testfile, "file for testing",
00181 "te", CMDSTRINGTYPE|CMDMSG, &testfile, "file for testing",
00182
00183 "AdaptOn", CMDSTRINGTYPE|CMDMSG, &adaptfile, "file for adaptation",
00184 "ad", CMDSTRINGTYPE|CMDMSG, &adaptfile, "file for adaptation",
00185
00186 "AdaptRate",CMDDOUBLETYPE|CMDMSG , &adaptrate, "adaptation rate",
00187 "ar", CMDDOUBLETYPE|CMDMSG, &adaptrate, "adaptation rate",
00188
00189 "AdaptLevel", CMDSUBRANGETYPE|CMDMSG, &adaptlevel, 1 , MAX_NGRAM, "adaptation level",
00190 "al",CMDSUBRANGETYPE|CMDMSG, &adaptlevel, 1, MAX_NGRAM, "adaptation level",
00191
00192 "AdaptOOV", CMDBOOLTYPE|CMDMSG, &adaptoov, "boolean flag for increasing the dictionary during adaptation (default is false)",
00193 "ao", CMDBOOLTYPE|CMDMSG, &adaptoov, "boolean flag for increasing the dictionary during adaptation (default is false)",
00194
00195 "SaveScaleFactor", CMDSTRINGTYPE|CMDMSG, &scalefactorfile, "output file for the scale factors",
00196 "ssf", CMDSTRINGTYPE|CMDMSG, &scalefactorfile, "output file for the scale factors",
00197
00198 "LanguageModelType",CMDENUMTYPE|CMDMSG, &lmtype, LmTypeEnum, "type of the LM",
00199 "lm",CMDENUMTYPE|CMDMSG, &lmtype, LmTypeEnum, "type of the LM",
00200
00201 "Interactive",CMDENUMTYPE|CMDMSG, &interactive, InteractiveModeEnum, "type of interaction",
00202 "i",CMDENUMTYPE|CMDMSG, &interactive, InteractiveModeEnum, "type of interaction",
00203
00204 "Statistics",CMDSUBRANGETYPE|CMDMSG, &statistics, 1, 3, "output statistics of the LM of increasing detail (default is 0)",
00205 "s",CMDSUBRANGETYPE|CMDMSG, &statistics, 1, 3, "output statistics of the LM of increasing detail (default is 0)",
00206
00207 "PruneThresh",CMDSUBRANGETYPE|CMDMSG, &prunefreq, 0, 1000, "threshold for pruning (default is 0)",
00208 "p",CMDSUBRANGETYPE|CMDMSG, &prunefreq, 0, 1000, "threshold for pruning (default is 0)",
00209
00210 "PruneSingletons",CMDBOOLTYPE|CMDMSG, &prunesingletons, "boolean flag for pruning of singletons (default is true)",
00211 "ps",CMDBOOLTYPE|CMDMSG, &prunesingletons, "boolean flag for pruning of singletons (default is true)",
00212
00213 "PruneTopSingletons",CMDBOOLTYPE|CMDMSG, &prunetopsingletons, "boolean flag for pruning of singletons at the top level (default is false)",
00214 "pts",CMDBOOLTYPE|CMDMSG, &prunetopsingletons, "boolean flag for pruning of singletons at the top level (default is false)",
00215
00216 "ComputeLMSize",CMDBOOLTYPE|CMDMSG, &compsize, "boolean flag for output the LM size (default is false)",
00217 "sz",CMDBOOLTYPE|CMDMSG, &compsize, "boolean flag for output the LM size (default is false)",
00218
00219 "MaximumCachingLevel", CMDINTTYPE|CMDMSG , &max_caching_level, "maximum level for caches (default is: LM order - 1)",
00220 "mcl", CMDINTTYPE|CMDMSG, &max_caching_level, "maximum level for caches (default is: LM order - 1)",
00221
00222 "MemoryMap", CMDBOOLTYPE|CMDMSG, &memmap, "use memory mapping for bianry saving (default is false)",
00223 "memmap", CMDBOOLTYPE|CMDMSG, &memmap, "use memory mapping for bianry saving (default is false)",
00224 "mm", CMDBOOLTYPE|CMDMSG, &memmap, "use memory mapping for bianry saving (default is false)",
00225
00226 "CheckProb",CMDBOOLTYPE|CMDMSG, &checkpr, "boolean flag for checking probability distribution during test (default is false)",
00227 "cp",CMDBOOLTYPE|CMDMSG, &checkpr, "boolean flag for checking probability distribution during test (default is false)",
00228
00229 "OutProb",CMDSTRINGTYPE|CMDMSG, &outpr, "output file for debugging during test (default is \"/dev/null\")",
00230 "op",CMDSTRINGTYPE|CMDMSG, &outpr, "output file for debugging during test (default is \"/dev/null\")",
00231
00232 "SubLMInfo", CMDSTRINGTYPE|CMDMSG, &slminfo, "configuration file for the mixture LM",
00233 "slmi", CMDSTRINGTYPE|CMDMSG, &slminfo, "configuration file for the mixture LM",
00234
00235 "SaveMixParam", CMDSTRINGTYPE|CMDMSG, &omixpar, "output file for weights of the mixture LM",
00236 "smp", CMDSTRINGTYPE|CMDMSG, &omixpar, "output file for weights of the mixture LM",
00237
00238 "LoadMixParam", CMDSTRINGTYPE|CMDMSG, &imixpar, "input file for weights of the mixture LM",
00239 "lmp", CMDSTRINGTYPE|CMDMSG, &imixpar, "input file for weights of the mixture LM",
00240
00241 "SetOovRate", CMDDOUBLETYPE|CMDMSG, &oovrate, "rate for computing the OOV frequency (=oovrate*totfreq if oovrate>0) (default is 0)",
00242 "or", CMDDOUBLETYPE|CMDMSG, &oovrate, "rate for computing the OOV frequency (=oovrate*totfreq if oovrate>0) (default is 0)",
00243
00244 "Beta", CMDDOUBLETYPE|CMDMSG, &beta, "beta value for Shift Beta LM (default is -1, i.e. automatic estimation)",
00245 "beta", CMDDOUBLETYPE|CMDMSG, &beta, "beta value for Shift Beta LM (default is -1, i.e. automatic estimation)",
00246
00247 "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00248 "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00249
00250 (char *)NULL
00251 );
00252
00253 if (argc == 1){
00254 usage();
00255 }
00256
00257 GetParams(&argc, &argv, (char*) NULL);
00258
00259 if (help){
00260 usage();
00261 }
00262
00263 if (!lmtype) {
00264 std::cerr << "The lm type (-lm) is not specified" << std::endl;
00265 exit(1);
00266 }
00267
00268 if (!trainfile && lmtype!=MIXTURE) {
00269 std::cerr << "The LM file (-tr) is not specified" << std::endl;
00270 exit(1);
00271 }
00272
00273 if (SavePerLevel == false && backoff == true){
00274 cerr << "WARNING: Current implementation does not support the usage of backoff (-bo=true) mixture models (-lm=mix) combined with the per-word saving (-saveperllevel=false)." << endl;
00275 cerr << "WARNING: The usage of backoff is disabled, i.e. -bo=no is forced" << endl;
00276
00277 backoff=false;
00278 }
00279
00280 mdiadaptlm *lm=NULL;
00281
00282 switch (lmtype) {
00283
00284 case SHIFT_BETA:
00285 if (beta==-1 || (beta<1.0 && beta>0))
00286 lm=new shiftbeta(trainfile,size,prunefreq,beta,(backoff?SHIFTBETA_B:SHIFTBETA_I));
00287 else {
00288 cerr << "ShiftBeta: beta must be >0 and <1\n";
00289 exit(1);
00290 }
00291 break;
00292
00293 case MOD_SHIFT_BETA:
00294 if (size>1)
00295 lm=new mshiftbeta(trainfile,size,prunefreq,(backoff?MSHIFTBETA_B:MSHIFTBETA_I));
00296 else {
00297 cerr << "Modified Shift Beta requires size > 1!\n";
00298 exit(1);
00299 }
00300
00301 break;
00302
00303 case SHIFT_ONE:
00304 lm=new shiftone(trainfile,size,prunefreq,(backoff?SIMPLE_B:SIMPLE_I));
00305 break;
00306
00307 case LINEAR_WB:
00308 lm=new linearwb(trainfile,size,prunefreq,(backoff?MSHIFTBETA_B:MSHIFTBETA_I));
00309 break;
00310
00311 case LINEAR_GT:
00312 cerr << "This LM is no more supported\n";
00313 break;
00314
00315 case MIXTURE:
00316
00317
00318 lm=new mixture(SavePerLevel,slminfo,size,prunefreq,imixpar,omixpar);
00319 break;
00320
00321 default:
00322 cerr << "not implemented yet\n";
00323 return 1;
00324 };
00325
00326 if (dub < lm->dict->size()){
00327 cerr << "dub (" << dub << ") is not set or too small. dub is re-set to the dictionary size (" << lm->dict->size() << ")" << endl;
00328 dub = lm->dict->size();
00329 }
00330
00331 lm->dub(dub);
00332
00333 lm->create_caches(max_caching_level);
00334
00335 cerr << "eventually generate OOV code\n";
00336 lm->dict->genoovcode();
00337
00338 if (oovrate) lm->dict->setoovrate(oovrate);
00339
00340 lm->save_per_level(SavePerLevel);
00341
00342 lm->train();
00343
00344
00345 if (prunetopsingletons==true) {
00346 lm->prunetopsingletons(true);
00347 lm->prunesingletons(false);
00348 } else {
00349 lm->prunetopsingletons(false);
00350 if (prunesingletons==true) {
00351 lm->prunesingletons(true);
00352 } else {
00353 lm->prunesingletons(false);
00354 }
00355 }
00356
00357 if (adaptoov) lm->dict->incflag(1);
00358
00359 if (adaptfile) lm->adapt(adaptfile,adaptlevel,adaptrate);
00360
00361 if (adaptoov) lm->dict->incflag(0);
00362
00363 if (scalefactorfile) lm->savescalefactor(scalefactorfile);
00364
00365 if (backoff) lm->compute_backoff();
00366
00367 if (size>lm->maxlevel()) {
00368 cerr << "lm size is too large\n";
00369 exit(1);
00370 }
00371
00372 if (!size) size=lm->maxlevel();
00373
00374 if (testfile) {
00375 cerr << "TLM: test ...";
00376 lm->test(testfile,size,backoff,checkpr,outpr);
00377
00378 if (adaptfile)
00379 ((mdiadaptlm *)lm)->get_zetacache()->stat();
00380
00381 cerr << "\n";
00382 };
00383
00384 if (compsize)
00385 cout << "LM size " << (int)lm->netsize() << "\n";
00386
00387 if (interactive) {
00388
00389 ngram ng(lm->dict);
00390 int nsize=0;
00391
00392 cout.setf(ios::scientific);
00393
00394 switch (interactive) {
00395
00396 case NGRAM:
00397 cout << "> ";
00398 while(cin >> ng) {
00399 if (ng.wordp(size)) {
00400 cout << ng << " p=" << (double)log(lm->prob(ng,size)) << "\n";
00401 ng.size=0;
00402 cout << "> ";
00403 }
00404 }
00405 break;
00406
00407 case SEQUENCE: {
00408 char c;
00409 double p=0;
00410 cout << "> ";
00411
00412 while(cin >> ng) {
00413 nsize=ng.size<size?ng.size:size;
00414 p=log(lm->prob(ng,nsize));
00415 cout << ng << " p=" << p << "\n";
00416
00417 while((c=cin.get())==' ') {
00418 cout << c;
00419 }
00420 cin.putback(c);
00421
00422 if (c=='\n') {
00423 ng.size=0;
00424 cout << "> ";
00425 p=0;
00426 }
00427 }
00428 }
00429
00430 break;
00431
00432 case TURN: {
00433 int n=0;
00434 double lp=0;
00435 double oov=0;
00436
00437 while(cin >> ng) {
00438
00439 if (ng.size>0) {
00440 nsize=ng.size<size?ng.size:size;
00441 lp-=log(lm->prob(ng,nsize));
00442 n++;
00443 if (*ng.wordp(1) == lm->dict->oovcode())
00444 oov++;
00445 } else {
00446 if (n>0) cout << n << " " << lp/(log(2.0) * n) << " " << oov/n << "\n";
00447 n=0;
00448 lp=0;
00449 oov=0;
00450 }
00451 }
00452
00453 break;
00454 }
00455
00456 case TEXT: {
00457 int order;
00458
00459 int n=0;
00460 double lp=0;
00461 double oov=0;
00462
00463 while (!cin.eof()) {
00464 cin >> order;
00465 if (order>size)
00466 cerr << "Warning: order > lm size\n";
00467
00468 order=order>size?size:order;
00469
00470 while (cin >> ng) {
00471 if (ng.size>0) {
00472 nsize=ng.size<order?ng.size:order;
00473 lp-=log(lm->prob(ng,nsize));
00474 n++;
00475 if (*ng.wordp(1) == lm->dict->oovcode())
00476 oov++;
00477 } else {
00478 if (n>0) cout << n << " " << lp/(log(2.0)*n) << " " << oov/n << "\n";
00479 n=0;
00480 lp=0;
00481 oov=0;
00482 if (ng.isym>0) break;
00483 }
00484 }
00485 }
00486 }
00487 break;
00488
00489 case ADAPT: {
00490
00491 if (backoff) {
00492 cerr << "This modality is not supported with backoff LMs\n";
00493 exit(1);
00494 }
00495
00496 char afile[50],tfile[50];
00497 while (!cin.eof()) {
00498 cin >> afile >> tfile;
00499 system("echo > .tlmlock");
00500
00501 cerr << "interactive adaptation: "
00502 << afile << " " << tfile << "\n";
00503
00504 if (adaptoov) lm->dict->incflag(1);
00505 lm->adapt(afile,adaptlevel,adaptrate);
00506 if (adaptoov) lm->dict->incflag(0);
00507 if (scalefactorfile) lm->savescalefactor(scalefactorfile);
00508 if (ASRfile) lm->saveASR(ASRfile,backoff,dictfile);
00509 if (ARPAfile) lm->saveARPA(ARPAfile,backoff,dictfile);
00510 if (BINfile) lm->saveBIN(BINfile,backoff,dictfile,memmap);
00511 lm->test(tfile,size,checkpr);
00512 cout.flush();
00513 system("rm .tlmlock");
00514 }
00515 }
00516 break;
00517 }
00518
00519 exit(1);
00520 }
00521
00522 if (ASRfile) {
00523 cerr << "TLM: save lm (ASR)...";
00524 lm->saveASR(ASRfile,backoff,dictfile);
00525 cerr << "\n";
00526 }
00527
00528 if (ARPAfile) {
00529 cerr << "TLM: save lm (ARPA)...";
00530 lm->saveARPA(ARPAfile,backoff,dictfile);
00531 cerr << "\n";
00532 }
00533
00534 if (BINfile) {
00535 cerr << "TLM: save lm (binary)...";
00536 lm->saveBIN(BINfile,backoff,dictfile,memmap);
00537 cerr << "\n";
00538 }
00539
00540 if (statistics) {
00541 cerr << "TLM: lm stat ...";
00542 lm->lmstat(statistics);
00543 cerr << "\n";
00544 }
00545
00546
00547
00548 cerr << "TLM: deleting lm ...";
00549 delete lm;
00550 cerr << "\n";
00551
00552 exit(0);
00553 }
00554
00555
00556