00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 using namespace std;
00028
00029 #include <iostream>
00030 #include <cmath>
00031 #include "cmd.h"
00032 #include "mfstream.h"
00033 #include "mempool.h"
00034 #include "htable.h"
00035 #include "dictionary.h"
00036 #include "n_gram.h"
00037 #include "ngramtable.h"
00038
00039 void print_help(int TypeFlag=0){
00040 std::cerr << std::endl << "ngt - collects n-grams" << std::endl;
00041 std::cerr << std::endl << "USAGE:" << std::endl;
00042 std::cerr << " ngt -i=<inputfile> [options]" << std::endl;
00043 std::cerr << std::endl << "OPTIONS:" << std::endl;
00044
00045 FullPrintParams(TypeFlag, 0, 1, stderr);
00046 }
00047
00048 void usage(const char *msg = 0)
00049 {
00050 if (msg){
00051 std::cerr << msg << std::endl;
00052 }
00053 else{
00054 print_help();
00055 }
00056 exit(1);
00057 }
00058
00059 int main(int argc, char **argv)
00060 {
00061 char *inp=NULL;
00062 char *out=NULL;
00063 char *dic=NULL;
00064 char *subdic=NULL;
00065 char *filterdict=NULL;
00066 char *filtertable=NULL;
00067 char *iknfile=NULL;
00068 double filter_hit_rate=1.0;
00069 char *aug=NULL;
00070 char *hmask=NULL;
00071 bool inputgoogleformat=false;
00072 bool outputgoogleformat=false;
00073 int ngsz=0;
00074 int dstco=0;
00075 bool bin=false;
00076 bool ss=false;
00077 bool LMflag=false;
00078 int inplen=0;
00079 bool tlm=false;
00080 char* ftlm=NULL;
00081 bool memuse=false;
00082
00083 bool help=false;
00084
00085
00086 DeclareParams((char*)
00087 "Dictionary", CMDSTRINGTYPE|CMDMSG, &dic, "dictionary filename",
00088 "d", CMDSTRINGTYPE|CMDMSG, &dic, "dictionary filename",
00089
00090 "NgramSize", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1, MAX_NGRAM, "n-gram default size; default is 0",
00091 "n", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1, MAX_NGRAM, "n-gram default size; default is 0",
00092 "InputFile", CMDSTRINGTYPE|CMDMSG, &inp, "input file",
00093 "i", CMDSTRINGTYPE|CMDMSG, &inp, "input file",
00094 "OutputFile", CMDSTRINGTYPE|CMDMSG, &out, "output file",
00095 "o", CMDSTRINGTYPE|CMDMSG, &out, "output file",
00096 "InputGoogleFormat", CMDBOOLTYPE|CMDMSG, &inputgoogleformat, "the input file contains data in the n-gram Google format; default is false",
00097 "gooinp", CMDBOOLTYPE|CMDMSG, &inputgoogleformat, "the input file contains data in the n-gram Google format; default is false",
00098 "OutputGoogleFormat", CMDBOOLTYPE|CMDMSG, &outputgoogleformat, "the output file contains data in the n-gram Google format; default is false",
00099 "gooout", CMDBOOLTYPE|CMDMSG, &outputgoogleformat, "the output file contains data in the n-gram Google format; default is false",
00100 "SaveBinaryTable", CMDBOOLTYPE|CMDMSG, &bin, "saves into binary format; default is false",
00101 "b", CMDBOOLTYPE|CMDMSG, &bin, "saves into binary format; default is false",
00102 "LmTable", CMDBOOLTYPE|CMDMSG, &LMflag, "works with LM table; default is false",
00103 "lm", CMDBOOLTYPE|CMDMSG, &LMflag, "works with LM table; default is false",
00104 "DistCo", CMDINTTYPE|CMDMSG, &dstco, "computes distance co-occurrences at the specified distance; default is 0",
00105 "dc", CMDINTTYPE|CMDMSG, &dstco, "computes distance co-occurrences at the specified distance; default is 0",
00106 "AugmentFile", CMDSTRINGTYPE|CMDMSG, &aug, "augmentation data",
00107 "aug", CMDSTRINGTYPE|CMDMSG, &aug, "augmentation data",
00108 "SaveSingle", CMDBOOLTYPE|CMDMSG, &ss, "generates single table; default is false",
00109 "ss", CMDBOOLTYPE|CMDMSG, &ss, "generates single table; default is false",
00110 "SubDict", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary",
00111 "sd", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary",
00112 "FilterDict", CMDSTRINGTYPE|CMDMSG, &filterdict, "filter dictionary",
00113 "fd", CMDSTRINGTYPE|CMDMSG, &filterdict, "filter dictionary",
00114 "ConvDict", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary",
00115 "cd", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary",
00116 "FilterTable", CMDSTRINGTYPE|CMDMSG, &filtertable, "ngramtable filename",
00117 "ftr", CMDDOUBLETYPE|CMDMSG, &filter_hit_rate, "ngramtable filename",
00118 "FilterTableRate", CMDDOUBLETYPE|CMDMSG, &filter_hit_rate, "minimum hit rate of filter; default is 1.0",
00119 "ft", CMDSTRINGTYPE|CMDMSG, &filtertable, "minimum hit rate of filter; default is 1.0",
00120 "HistoMask",CMDSTRINGTYPE|CMDMSG, &hmask, "history mask",
00121 "hm",CMDSTRINGTYPE|CMDMSG, &hmask, "history mask",
00122 "InpLen",CMDINTTYPE|CMDMSG, &inplen, "input length for mask generation; default is 0",
00123 "il",CMDINTTYPE|CMDMSG, &inplen, "input length for mask generation; default is 0",
00124 "tlm", CMDBOOLTYPE|CMDMSG, &tlm, "test LM table; default is false",
00125 "ftlm", CMDSTRINGTYPE|CMDMSG, &ftlm, "file to test LM table",
00126 "memuse", CMDBOOLTYPE|CMDMSG, &memuse, "default is false",
00127 "iknstat", CMDSTRINGTYPE|CMDMSG, &iknfile, "filename to save IKN statistics",
00128
00129 "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00130 "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00131
00132 (char *)NULL
00133 );
00134
00135
00136 if (argc == 1){
00137 usage();
00138 }
00139
00140 GetParams(&argc, &argv, (char*) NULL);
00141
00142 if (help){
00143 usage();
00144 }
00145
00146 if (inp==NULL) {
00147 usage("Warning: no input file specified\n");
00148 };
00149
00150 if (out==NULL) {
00151 cerr << "Warning: no output file specified!\n";
00152 }
00153
00154 TABLETYPE table_type=COUNT;
00155
00156 if (LMflag) {
00157 cerr << "Working with LM table\n";
00158 table_type=LEAFPROB;
00159 }
00160
00161
00162
00163
00164 if (filtertable) {
00165
00166 {
00167 ngramtable ngt(filtertable,1,NULL,NULL,NULL,0,0,NULL,0,table_type);
00168 mfstream inpstream(inp,ios::in);
00169 mfstream outstream(out,ios::out);
00170
00171 cerr << "Filtering table " << inp << " assumed to be in Google Format with size " << ngsz << "\n";
00172 cerr << "with table " << filtertable << " of size " << ngt.maxlevel() << "\n";
00173 cerr << "with hit rate " << filter_hit_rate << "\n";
00174
00175
00176 assert(ngt.maxlevel() <= ngsz);
00177
00178
00179
00180
00181
00182 ngram ng(ngt.dict), ng2(ng.dict);
00183 double hits=0;
00184 double maxhits=(double)(ngsz-ngt.maxlevel()+1);
00185
00186 long c=0;
00187 while(inpstream >> ng) {
00188
00189 if (ng.size>= ngt.maxlevel()) {
00190
00191 ng2=ng;
00192 ng2.size=ngt.maxlevel();
00193
00194 hits+=(ngt.get(ng2)?1:0);
00195 }
00196
00197 if (ng.size==ngsz) {
00198 if (!(++c % 1000000)) cerr << ".";
00199
00200
00201 inpstream >> ng.freq;
00202
00203 if (((hits/maxhits)>=filter_hit_rate) &&
00204 (!ng.containsWord(ngt.dict->OOV(),ng.size))
00205 )
00206 outstream << ng << "\n";
00207 hits=0;
00208 ng.size=0;
00209 }
00210 }
00211
00212 outstream.flush();
00213 inpstream.flush();
00214 }
00215
00216 exit(1);
00217 }
00218
00219
00220
00221
00222 ngramtable* ngt=new ngramtable(inp,ngsz,NULL,NULL,filterdict,inputgoogleformat,dstco,hmask,inplen,table_type);
00223
00224 if (aug) {
00225 ngt->dict->incflag(1);
00226
00227 ngramtable ngt2(aug,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type);
00228 ngt->augment(&ngt2);
00229 ngt->dict->incflag(0);
00230 }
00231
00232
00233 if (subdic) {
00234 int c=0;
00235
00236 ngramtable *ngt2=new ngramtable(NULL,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type);
00237
00238
00239 dictionary tmpdict(subdic);
00240 ngt2->dict->incflag(1);
00241 for (int i=0; i<ngt->dict->size(); i++) {
00242 if (tmpdict.encode(ngt->dict->decode(i)) != tmpdict.oovcode()) {
00243 ngt2->dict->encode(ngt->dict->decode(i));
00244 }
00245 }
00246 ngt2->dict->incflag(0);
00247
00248 ngt2->dict->cleanfreq();
00249
00250
00251 if (ngt->dict->encode(ngt->dict->EoS())!=ngt->dict->oovcode()) {
00252 ngt2->dict->incflag(1);
00253 ngt2->dict->encode(ngt2->dict->EoS());
00254 ngt2->dict->incflag(0);
00255 }
00256 if (ngt->dict->encode(ngt->dict->BoS())!=ngt->dict->oovcode()) {
00257 ngt2->dict->incflag(1);
00258 ngt2->dict->encode(ngt2->dict->BoS());
00259 ngt2->dict->incflag(0);
00260 }
00261
00262
00263 ngram ng(ngt->dict);
00264 ngram ng2(ngt2->dict);
00265
00266 ngt->scan(ng,INIT,ngsz);
00267 while (ngt->scan(ng,CONT,ngsz)) {
00268 ng2.trans(ng);
00269 ngt2->put(ng2);
00270 if (!(++c % 1000000)) cerr << ".";
00271 }
00272
00273
00274 int oov=ngt2->dict->getcode(ngt2->dict->OOV());
00275 if(oov>=0) ngt2->dict->oovcode(oov);
00276
00277 for (int i=0; i<ngt->dict->size(); i++) {
00278 ngt2->dict->incfreq(ngt2->dict->encode(ngt->dict->decode(i)),
00279 ngt->dict->freq(i));
00280 }
00281
00282 cerr <<" oov: " << ngt2->dict->freq(ngt2->dict->oovcode()) << "\n";
00283
00284 delete ngt;
00285 ngt=ngt2;
00286
00287 }
00288
00289 if (ngsz < ngt->maxlevel() && hmask) {
00290 cerr << "start projection of ngramtable " << inp
00291 << " according to hmask\n";
00292
00293 int i,c;
00294 int selmask[MAX_NGRAM];
00295
00296
00297 i=0;
00298 selmask[i++]=1;
00299 for (c=0; c< (int)strlen(hmask); c++) {
00300 cerr << hmask[c] << "\n";
00301 if (hmask[c] == '1')
00302 selmask[i++]=c+2;
00303 }
00304
00305 if (i!= ngsz) {
00306 cerr << "wrong mask: 1 bits=" << i << " maxlev=" << ngsz << "\n";
00307 exit(1);
00308 }
00309
00310 if (selmask[ngsz-1] > ngt->maxlevel()) {
00311 cerr << "wrong mask: farest bits=" << selmask[ngsz-1]
00312 << " maxlev=" << ngt->maxlevel() << "\n";
00313 exit(1);
00314 }
00315
00316
00317 ngramtable* ngt2=new ngramtable(NULL,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type);
00318
00319 ngt2->dict->incflag(1);
00320
00321 ngram ng(ngt->dict);
00322 ngram png(ngt->dict,ngsz);
00323 ngram ng2(ngt2->dict,ngsz);
00324
00325 ngt->scan(ng,INIT,ngt->maxlevel());
00326 while (ngt->scan(ng,CONT,ngt->maxlevel())) {
00327
00328 for (i=0; i<ngsz; i++)
00329 *png.wordp(i+1)=*ng.wordp(selmask[i]);
00330 png.freq=ng.freq;
00331
00332 ng2.trans(png);
00333 ngt2->put(ng2);
00334 if (!(++c % 1000000)) cerr << ".";
00335 }
00336
00337 char info[100];
00338 sprintf(info,"hm%s",hmask);
00339 ngt2->ngtype(info);
00340
00341
00342 int oov=ngt2->dict->getcode(ngt2->dict->OOV());
00343 if(oov>=0) ngt2->dict->oovcode(oov);
00344
00345 for (int i=0; i<ngt->dict->size(); i++) {
00346 ngt2->dict->incfreq(ngt2->dict->encode(ngt->dict->decode(i)),
00347 ngt->dict->freq(i));
00348 }
00349
00350 cerr <<" oov: " << ngt2->dict->freq(ngt2->dict->oovcode()) << "\n";
00351
00352 delete ngt;
00353 ngt=ngt2;
00354 }
00355
00356
00357 if (tlm && table_type==LEAFPROB) {
00358 ngram ng(ngt->dict);
00359 cout.setf(ios::scientific);
00360
00361 cout << "> ";
00362 while(cin >> ng) {
00363 ngt->bo_state(0);
00364 if (ng.size>=ngsz) {
00365 cout << ng << " p= " << log(ngt->prob(ng));
00366 cout << " bo= " << ngt->bo_state() << "\n";
00367 } else
00368 cout << ng << " p= NULL\n";
00369
00370 cout << "> ";
00371 }
00372
00373 }
00374
00375
00376 if (ftlm && table_type==LEAFPROB) {
00377
00378 ngram ng(ngt->dict);
00379 cout.setf(ios::fixed);
00380 cout.precision(2);
00381
00382 mfstream inptxt(ftlm,ios::in);
00383 int Nbo=0,Nw=0,Noov=0;
00384 float logPr=0,PP=0,PPwp=0;
00385
00386 int bos=ng.dict->encode(ng.dict->BoS());
00387
00388 while(inptxt >> ng) {
00389
00390
00391 if (*ng.wordp(1)==bos) {
00392 ng.size=1;
00393 continue;
00394 }
00395
00396 ngt->bo_state(0);
00397 if (ng.size>=1) {
00398 logPr+=log(ngt->prob(ng));
00399 if (*ng.wordp(1) == ngt->dict->oovcode())
00400 Noov++;
00401
00402 Nw++;
00403 if (ngt->bo_state()) Nbo++;
00404 }
00405 }
00406
00407 PP=exp(-logPr/Nw);
00408 PPwp= PP * exp(Noov * log(10000000.0-ngt->dict->size())/Nw);
00409
00410 cout << "%%% NGT TEST OF SMT LM\n";
00411 cout << "%% LM=" << inp << " SIZE="<< ngt->maxlevel();
00412 cout << " TestFile="<< ftlm << "\n";
00413 cout << "%% OOV PENALTY = 1/" << 10000000.0-ngt->dict->size() << "\n";
00414
00415
00416 cout << "%% Nw=" << Nw << " PP=" << PP << " PPwp=" << PPwp
00417 << " Nbo=" << Nbo << " Noov=" << Noov
00418 << " OOV=" << (float)Noov/Nw * 100.0 << "%\n";
00419
00420 }
00421
00422
00423 if (memuse) ngt->stat(0);
00424
00425
00426 if (iknfile) {
00427
00428 ngram ng(ngt->dict);
00429 int n1,n2,n3,n4;
00430 int unover3=0;
00431 mfstream iknstat(iknfile,ios::out);
00432
00433 for (int l=1; l<=ngt->maxlevel(); l++) {
00434
00435 cerr << "level " << l << "\n";
00436 iknstat << "level: " << l << " ";
00437
00438 cerr << "computing statistics\n";
00439
00440 n1=0;
00441 n2=0;
00442 n3=0,n4=0;
00443
00444 ngt->scan(ng,INIT,l);
00445
00446 while(ngt->scan(ng,CONT,l)) {
00447
00448
00449 if (l>1 && ng.containsWord(ngt->dict->OOV(),l)) {
00450
00451 continue;
00452 }
00453
00454
00455 if (l>1 && ng.containsWord(ngt->dict->EoS(),l-1)) {
00456
00457 continue;
00458 }
00459
00460
00461 if (l==1 && ng.containsWord(ngt->dict->BoS(),l)) {
00462
00463 continue;
00464 }
00465
00466 if (ng.freq==1) n1++;
00467 else if (ng.freq==2) n2++;
00468 else if (ng.freq==3) n3++;
00469 else if (ng.freq==4) n4++;
00470 if (l==1 && ng.freq >=3) unover3++;
00471
00472 }
00473
00474
00475 cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << "\n";
00476 iknstat << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << " unover3: " << unover3 << "\n";
00477
00478 }
00479
00480 }
00481
00482
00483 if (out)
00484 bin?ngt->savebin(out,ngsz): ngt->savetxt(out,ngsz,outputgoogleformat);
00485
00486
00487 }
00488