00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 using namespace std;
00024
00025 #include <iostream>
00026 #include <fstream>
00027 #include <vector>
00028 #include <string>
00029 #include <stdlib.h>
00030 #include "cmd.h"
00031 #include "util.h"
00032 #include "math.h"
00033 #include "lmContainer.h"
00034
00035
00036 void print_help(int TypeFlag=0){
00037 std::cerr << std::endl << "compile-lm - compiles an ARPA format LM into an IRSTLM format one" << std::endl;
00038 std::cerr << std::endl << "USAGE:" << std::endl;
00039 std::cerr << " compile-lm [options] <input-file.lm> [output-file.blm]" << std::endl;
00040 std::cerr << std::endl << "DESCRIPTION:" << std::endl;
00041 std::cerr << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl;
00042 std::cerr << " a compiled representation that the IRST LM toolkit can quickly" << std::endl;
00043 std::cerr << " read and process. LM file can be compressed." << std::endl;
00044 std::cerr << std::endl << "OPTIONS:" << std::endl;
00045
00046 FullPrintParams(TypeFlag, 0, 1, stderr);
00047 }
00048
00049 void usage(const char *msg = 0)
00050 {
00051 if (msg) {
00052 std::cerr << msg << std::endl;
00053 }
00054 if (!msg){
00055 print_help();
00056 }
00057 exit(1);
00058 }
00059
00060 int main(int argc, char **argv)
00061 {
00062 char *seval=NULL;
00063 char *tmpdir=NULL;
00064 char *sfilter=NULL;
00065
00066 bool textoutput = false;
00067 bool sent_PP_flag = false;
00068 bool invert = false;
00069 bool sscore = false;
00070 bool skeepunigrams = false;
00071
00072 int debug = 0;
00073 bool memmap = false;
00074 int requiredMaxlev = 1000;
00075 int dub = 10000000;
00076 int randcalls = 0;
00077 float ngramcache_load_factor = 0.0;
00078 float dictionary_load_factor = 0.0;
00079
00080 bool help=false;
00081 std::vector<std::string> files;
00082
00083 DeclareParams((char*)
00084 "text", CMDBOOLTYPE|CMDMSG, &textoutput, "output is again in text format; default is false",
00085 "t", CMDBOOLTYPE|CMDMSG, &textoutput, "output is again in text format; default is false",
00086 "filter", CMDSTRINGTYPE|CMDMSG, &sfilter, "filter a binary language model with a word list",
00087 "f", CMDSTRINGTYPE|CMDMSG, &sfilter, "filter a binary language model with a word list",
00088 "keepunigrams", CMDBOOLTYPE|CMDMSG, &skeepunigrams, "filter by keeping all unigrams in the table, default is true",
00089 "ku", CMDBOOLTYPE|CMDMSG, &skeepunigrams, "filter by keeping all unigrams in the table, default is true",
00090 "eval", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file",
00091 "e", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file",
00092 "randcalls", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
00093 "r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
00094 "score", CMDBOOLTYPE|CMDMSG, &sscore, "computes log-prob scores of n-grams from standard input",
00095 "s", CMDBOOLTYPE|CMDMSG, &sscore, "computes log-prob scores of n-grams from standard input",
00096 "debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
00097 "d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
00098 "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
00099 "l", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
00100 "memmap", CMDBOOLTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM",
00101 "mm", CMDBOOLTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM",
00102 "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7",
00103 "tmpdir", CMDSTRINGTYPE|CMDMSG, &tmpdir, "directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")",
00104 "invert", CMDBOOLTYPE|CMDMSG, &invert, "builds an inverted n-gram binary table for fast access; default if false",
00105 "i", CMDBOOLTYPE|CMDMSG, &invert, "builds an inverted n-gram binary table for fast access; default if false",
00106 "sentence", CMDBOOLTYPE|CMDMSG, &sent_PP_flag, "computes perplexity at sentence level (identified through the end symbol)",
00107 "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 0",
00108 "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false",
00109
00110 "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00111 "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00112
00113 (char*)NULL
00114 );
00115
00116 if (argc == 1){
00117 usage();
00118 }
00119
00120 for(int i=1; i < argc; i++) {
00121 if(argv[i][0] != '-'){
00122 files.push_back(argv[i]);
00123 }
00124 }
00125
00126
00127 GetParams(&argc, &argv, (char*) NULL);
00128
00129 if (help){
00130 usage();
00131 }
00132
00133 if (files.size() > 2) {
00134 usage("Warning: Too many arguments");
00135 }
00136
00137 if (files.size() < 1) {
00138 usage("Warning: Please specify a LM file to read from");
00139 }
00140
00141
00142 OUTFILE_TYPE outtype;
00143 if (textoutput)
00144 outtype=TEXT;
00145 else if (seval != NULL || sscore)
00146 outtype=NONE;
00147 else
00148 outtype=BINARY;
00149
00150 std::string infile = files[0];
00151 std::string outfile = "";
00152
00153 if (files.size() == 1) {
00154 outfile=infile;
00155
00156
00157 std::string::size_type p = outfile.rfind('/');
00158 if (p != std::string::npos && ((p+1) < outfile.size()))
00159 outfile.erase(0,p+1);
00160
00161
00162 if (outfile.compare(outfile.size()-3,3,".gz")==0)
00163 outfile.erase(outfile.size()-3,3);
00164
00165 outfile+=(textoutput?".lm":".blm");
00166 } else{
00167 outfile = files[1];
00168 }
00169
00170 std::cerr << "inpfile: " << infile << std::endl;
00171 std::cerr << "outfile: " << outfile << std::endl;
00172 if (seval!=NULL) std::cerr << "evalfile: " << seval << std::endl;
00173 if (sscore==true) std::cerr << "interactive: " << sscore << std::endl;
00174 if (memmap) std::cerr << "memory mapping: " << memmap << std::endl;
00175 std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
00176 std::cerr << "dub: " << dub<< std::endl;
00177 if (tmpdir != NULL) {
00178 if (setenv("TMP",tmpdir,1))
00179 std::cerr << "temporary directory has not been set" << std::endl;
00180 std::cerr << "tmpdir: " << tmpdir << std::endl;
00181 }
00182
00183
00184
00185 lmContainer* lmt=NULL;
00186
00187 lmt = lmt->CreateLanguageModel(infile,ngramcache_load_factor,dictionary_load_factor);
00188
00189
00190 if (invert) lmt->is_inverted(invert);
00191
00192 lmt->setMaxLoadedLevel(requiredMaxlev);
00193
00194 lmt->load(infile);
00195
00196
00197 if (sfilter != NULL) {
00198 lmContainer* filtered_lmt = NULL;
00199 std::cerr << "BEFORE sublmC (" << (void*) filtered_lmt << ") (" << (void*) &filtered_lmt << ")\n";
00200
00201
00202 if (((lmContainer*) lmt)->filter(sfilter,filtered_lmt,skeepunigrams?"yes":"no")) {
00203 std::cerr << "BFR filtered_lmt (" << (void*) filtered_lmt << ") (" << (void*) &filtered_lmt << ")\n";
00204 filtered_lmt->stat();
00205 delete lmt;
00206 lmt=filtered_lmt;
00207 std::cerr << "AFTER filtered_lmt (" << (void*) filtered_lmt << ")\n";
00208 filtered_lmt->stat();
00209 std::cerr << "AFTER lmt (" << (void*) lmt << ")\n";
00210 lmt->stat();
00211 }
00212 }
00213
00214 if (dub) lmt->setlogOOVpenalty((int)dub);
00215
00216
00217 lmt->init_caches(lmt->maxlevel());
00218
00219 if (seval != NULL) {
00220 if (randcalls>0) {
00221
00222 cerr << "perform random " << randcalls << " using dictionary of test set\n";
00223 dictionary *dict;
00224 dict=new dictionary(seval);
00225
00226
00227 int histo[dict->totfreq()];
00228 int totfreq=0;
00229
00230 for (int n=0; n<dict->size(); n++)
00231 for (int m=0; m<dict->freq(n); m++)
00232 histo[totfreq++]=n;
00233
00234 ngram ng(lmt->getDict());
00235 srand(1234);
00236 double bow;
00237 int bol=0;
00238
00239 if (debug>1) ResetUserTime();
00240
00241 for (int n=0; n<randcalls; n++) {
00242
00243 int w=histo[rand() % totfreq];
00244
00245 ng.pushc(lmt->getDict()->encode(dict->decode(w)));
00246
00247 lmt->clprob(ng,&bow,&bol);
00248
00249 if (debug==1) {
00250 std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << lmt->maxlevel()-bol << "]" << " ";
00251 std::cout << std::endl;
00252 }
00253
00254 if ((n % 100000)==0) {
00255 std::cerr << ".";
00256 lmt->check_caches_levels();
00257 }
00258 }
00259 std::cerr << "\n";
00260 if (debug>1) PrintUserTime("Finished in");
00261 if (debug>1) lmt->stat();
00262
00263 delete lmt;
00264 return 0;
00265
00266 } else {
00267 if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
00268 debug = (debug>4)?4:debug;
00269 std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
00270 }
00271 std::cerr << "Start Eval" << std::endl;
00272 std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
00273 ngram ng(lmt->getDict());
00274 std::cout.setf(ios::fixed);
00275 std::cout.precision(2);
00276
00277
00278 std::fstream inptxt(seval,std::ios::in);
00279
00280 int Nbo=0, Nw=0,Noov=0;
00281 double logPr=0,PP=0,PPwp=0,Pr;
00282
00283
00284 int sent_Nbo=0, sent_Nw=0,sent_Noov=0;
00285 double sent_logPr=0,sent_PP=0,sent_PPwp=0;
00286
00287
00288 ng.dict->incflag(1);
00289 int bos=ng.dict->encode(ng.dict->BoS());
00290 int eos=ng.dict->encode(ng.dict->EoS());
00291 ng.dict->incflag(0);
00292
00293 double bow;
00294 int bol=0;
00295 char *msp;
00296 unsigned int statesize;
00297
00298 lmt->dictionary_incflag(1);
00299
00300 while(inptxt >> ng) {
00301
00302 if (ng.size>lmt->maxlevel()) ng.size=lmt->maxlevel();
00303
00304
00305 if (*ng.wordp(1)==bos) {
00306 ng.size=1;
00307 continue;
00308 }
00309
00310 if (ng.size>=1) {
00311 Pr=lmt->clprob(ng,&bow,&bol,&msp,&statesize);
00312 logPr+=Pr;
00313 sent_logPr+=Pr;
00314
00315 if (debug==1) {
00316 std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << ng.size-bol << "]" << " ";
00317 if (*ng.wordp(1)==eos) std::cout << std::endl;
00318 }
00319 if (debug==2) {
00320 std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr;
00321 std::cout << std::endl;
00322 }
00323 if (debug==3) {
00324 std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr << " bow:" << bow;
00325 std::cout << std::endl;
00326 }
00327 if (debug==4) {
00328 std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow;
00329 std::cout << std::endl;
00330 }
00331 if (debug>4) {
00332 std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow;
00333 double totp=0.0;
00334 int oldw=*ng.wordp(1);
00335 double oovp=lmt->getlogOOVpenalty();
00336 lmt->setlogOOVpenalty((double) 0);
00337 for (int c=0; c<ng.dict->size(); c++) {
00338 *ng.wordp(1)=c;
00339 totp+=pow(10.0,lmt->clprob(ng));
00340 }
00341 *ng.wordp(1)=oldw;
00342
00343 if ( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5))
00344 std::cout << " [t=" << totp << "] POSSIBLE ERROR";
00345 std::cout << std::endl;
00346
00347 lmt->setlogOOVpenalty((double)oovp);
00348 }
00349
00350
00351 if (lmt->is_OOV(*ng.wordp(1))) {
00352 Noov++;
00353 sent_Noov++;
00354 }
00355 if (bol) {
00356 Nbo++;
00357 sent_Nbo++;
00358 }
00359 Nw++;
00360 sent_Nw++;
00361 if (sent_PP_flag && (*ng.wordp(1)==eos)) {
00362 sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw);
00363 sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov * lmt->getlogOOVpenalty()) * log(10.0) / sent_Nw));
00364
00365 std::cout << "%% sent_Nw=" << sent_Nw
00366 << " sent_PP=" << sent_PP
00367 << " sent_PPwp=" << sent_PPwp
00368 << " sent_Nbo=" << sent_Nbo
00369 << " sent_Noov=" << sent_Noov
00370 << " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
00371
00372 sent_Nw=sent_Noov=sent_Nbo=0;
00373 sent_logPr=0.0;
00374 }
00375
00376 if ((Nw % 100000)==0) {
00377 std::cerr << ".";
00378 lmt->check_caches_levels();
00379 }
00380
00381 }
00382 }
00383
00384 PP=exp((-logPr * log(10.0)) /Nw);
00385
00386 PPwp= PP * (1 - 1/exp((Noov * lmt->getlogOOVpenalty()) * log(10.0) / Nw));
00387
00388 std::cout << "%% Nw=" << Nw
00389 << " PP=" << PP
00390 << " PPwp=" << PPwp
00391 << " Nbo=" << Nbo
00392 << " Noov=" << Noov
00393 << " OOV=" << (float)Noov/Nw * 100.0 << "%";
00394 if (debug) std::cout << " logPr=" << logPr;
00395 std::cout << std::endl;
00396
00397 lmt->used_caches();
00398 lmt->stat();
00399
00400 if (debug>1) lmt->stat();
00401
00402 delete lmt;
00403 return 0;
00404 };
00405 }
00406
00407 if (sscore == true) {
00408
00409 ngram ng(lmt->getDict());
00410 int bos=ng.dict->encode(ng.dict->BoS());
00411
00412 int bol;
00413 double bow;
00414 unsigned int n=0;
00415
00416 std::cout.setf(ios::scientific);
00417 std::cout << "> ";
00418
00419 lmt->dictionary_incflag(1);
00420
00421 while(std::cin >> ng) {
00422
00423
00424
00425 if (*ng.wordp(1)==bos) {
00426 ng.size=1;
00427 continue;
00428 }
00429
00430 if (ng.size>=lmt->maxlevel()) {
00431 ng.size=lmt->maxlevel();
00432 ++n;
00433 if ((n % 100000)==0) {
00434 std::cerr << ".";
00435 lmt->check_caches_levels();
00436 }
00437 std::cout << ng << " p= " << lmt->clprob(ng,&bow,&bol) * M_LN10;
00438 std::cout << " bo= " << bol << std::endl;
00439 } else {
00440 std::cout << ng << " p= NULL" << std::endl;
00441 }
00442 std::cout << "> ";
00443 }
00444 std::cout << std::endl;
00445 if (debug>1) lmt->stat();
00446 delete lmt;
00447 return 0;
00448 }
00449
00450 if (textoutput == true) {
00451 std::cerr << "Saving in txt format to " << outfile << std::endl;
00452 lmt->savetxt(outfile.c_str());
00453 } else if (!memmap) {
00454 std::cerr << "Saving in bin format to " << outfile << std::endl;
00455 lmt->savebin(outfile.c_str());
00456 } else {
00457 std::cerr << "Impossible to save to " << outfile << std::endl;
00458 }
00459 delete lmt;
00460 return 0;
00461 }
00462