00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "mfstream.h"
00024 #include <stdio.h>
00025 #include <stdlib.h>
00026 #include <iomanip>
00027 #include <iostream>
00028 #include <fstream>
00029 #include "mempool.h"
00030 #include "htable.h"
00031 #include "dictionary.h"
00032 #include "index.h"
00033 #include "util.h"
00034
00035 using namespace std;
00036
00037 dictionary::dictionary(char *filename,int size, float lf)
00038 {
00039 if (lf<=0.0) lf=DICTIONARY_LOAD_FACTOR;
00040 load_factor=lf;
00041
00042 htb = new HASHTABLE_t((size_t) (size/load_factor));
00043 tb = new dict_entry[size];
00044 st = new strstack(size * 10);
00045
00046 for (int i=0; i<size; i++) tb[i].freq=0;
00047
00048 oov_code = -1;
00049
00050 n = 0;
00051 N = 0;
00052 dubv = 0;
00053 lim = size;
00054 ifl=0;
00055
00056 if (filename==NULL) return;
00057
00058 mfstream inp(filename,ios::in);
00059
00060 if (!inp) {
00061 cerr << "cannot open " << filename << "\n";
00062 exit(1);
00063 }
00064
00065 char buffer[100];
00066
00067 inp >> setw(100) >> buffer;
00068
00069 inp.close();
00070
00071 if ((strncmp(buffer,"dict",4)==0) ||
00072 (strncmp(buffer,"DICT",4)==0))
00073 load(filename);
00074 else
00075 generate(filename);
00076
00077 cerr << "loaded \n";
00078
00079 }
00080
00081
00082
00083 int dictionary::getword(fstream& inp , char* buffer)
00084 {
00085
00086
00087 while(inp >> setw(MAX_WORD) >> buffer) {
00088
00089
00090
00091 if (strlen(buffer)==(MAX_WORD-1)) {
00092 cerr << "getword: a very long word was read ("
00093 << buffer << ")\n";
00094 }
00095
00096
00097 if (strlen(buffer)==0) {
00098 cerr << "zero length word!\n";
00099 continue;
00100 }
00101
00102
00103 return 1;
00104
00105 }
00106
00107 return 0;
00108 }
00109
00110
00111 void dictionary::generate(char *filename)
00112 {
00113
00114 char buffer[MAX_WORD];
00115 int counter=0;
00116
00117 mfstream inp(filename,ios::in);
00118
00119 if (!inp) {
00120 cerr << "cannot open " << filename << "\n";
00121 exit(1);
00122 }
00123
00124 cerr << "dict:";
00125
00126 ifl=1;
00127
00128 while (getword(inp,buffer)) {
00129
00130 incfreq(encode(buffer),1);
00131
00132 if (!(++counter % 1000000)) cerr << ".";
00133 }
00134
00135 ifl=0;
00136
00137 cerr << "\n";
00138
00139 inp.close();
00140
00141 }
00142
00143 void dictionary::augment(dictionary *d)
00144 {
00145 incflag(1);
00146 for (int i=0; i<d->n; i++)
00147 encode(d->decode(i));
00148 incflag(0);
00149 encode(OOV());
00150 }
00151
00152
00153
00154
00155
00156
00157
00158 void dictionary::print_curve(int curvesize, float* testOOV)
00159 {
00160
00161 int* curve = new int[curvesize];
00162 for (int i=0; i<curvesize; i++) curve[i]=0;
00163
00164
00165 for (int i=0; i<n; i++) {
00166 if(tb[i].freq > curvesize-1)
00167 curve[curvesize-1]++;
00168 else
00169 curve[tb[i].freq-1]++;
00170 }
00171
00172
00173 for (int i=curvesize-2; i>=0; i--) {
00174 curve[i] = curve[i] + curve[i+1];
00175 }
00176
00177 cout.setf(ios::fixed);
00178 cout << "Dict size: " << n << "\n";
00179 cout << "**************** DICTIONARY GROWTH CURVE ****************\n";
00180 cout << "Freq\tEntries\tPercent";
00181 if(testOOV!=NULL)
00182 cout << "\t\tFreq\tOOV onTest";
00183 cout << "\n";
00184
00185 for (int i=0; i<curvesize; i++) {
00186
00187 cout << ">" << i << "\t" << curve[i] << "\t" << setprecision(2) << (float)curve[i]/n * 100.0 << "%";
00188
00189
00190 if(testOOV!=NULL)
00191 cout << "\t\t<" << i+1<< "\t" << testOOV[i] << "%";
00192 cout << "\n";
00193 }
00194 cout << "*********************************************************\n";
00195 }
00196
00197
00198
00199
00200
00201
00202 float* dictionary::test(int curvesize, const char *filename, int listflag)
00203 {
00204
00205 int NwTest=0;
00206 int* OOVchart = new int[curvesize];
00207 for (int j=0; j<curvesize; j++) OOVchart[j]=0;
00208 char buffer[MAX_WORD];
00209
00210 const char* bos = BoS();
00211
00212 int k;
00213 mfstream inp(filename,ios::in);
00214
00215 if (!inp) {
00216 cerr << "cannot open test: " << filename << "\n";
00217
00218 return NULL;
00219 }
00220 cerr << "test:";
00221
00222 k=0;
00223 while (getword(inp,buffer)) {
00224
00225
00226 if (strcmp(buffer,bos)==0)
00227 continue;
00228
00229 int freq = 0;
00230 int wCode = getcode(buffer);
00231 if(wCode!=-1) freq = tb[wCode].freq;
00232
00233 if(freq==0) {
00234 OOVchart[0]++;
00235 if(listflag) {
00236 cerr << "<OOV>" << buffer << "</OOV>\n";
00237 }
00238 } else {
00239 if(freq < curvesize) OOVchart[freq]++;
00240 }
00241 NwTest++;
00242 if (!(++k % 1000000)) cerr << ".";
00243 }
00244 cerr << "\n";
00245 inp.close();
00246 cout << "nb words of test: " << NwTest << "\n";
00247
00248
00249 for (int i=1; i<curvesize; i++)
00250 OOVchart[i] = OOVchart[i] + OOVchart[i-1];
00251
00252
00253 float* OOVrates = new float[curvesize];
00254 for (int i=0; i<curvesize; i++)
00255 OOVrates[i] = (float)OOVchart[i]/NwTest * 100.0;
00256
00257 return OOVrates;
00258 }
00259
00260 void dictionary::load(char* filename)
00261 {
00262 char header[100];
00263 char* addr;
00264 char buffer[MAX_WORD];
00265 int freqflag=0;
00266
00267 mfstream inp(filename,ios::in);
00268
00269 if (!inp) {
00270 cerr << "\ncannot open " << filename << "\n";
00271 exit(1);
00272 }
00273
00274 cerr << "dict:";
00275
00276 inp.getline(header,100);
00277 if (strncmp(header,"DICT",4)==0)
00278 freqflag=1;
00279 else if (strncmp(header,"dict",4)!=0) {
00280 cerr << "\ndictionary file " << filename << " has a wrong header\n";
00281 exit(1);
00282 }
00283
00284
00285 while (getword(inp,buffer)) {
00286
00287 tb[n].word=st->push(buffer);
00288 tb[n].code=n;
00289
00290 if (freqflag)
00291 inp >> tb[n].freq;
00292 else
00293 tb[n].freq=0;
00294
00295
00296 if ((addr=htb->insert((char*)&tb[n].word))) {
00297 if (addr!=(char *)&tb[n].word) {
00298 cerr << "dictionary::loadtxt wrong entry was found ("
00299 << buffer << ") in position " << n << "\n";
00300
00301 continue;
00302 }
00303 }
00304
00305 N+=tb[n].freq;
00306
00307 if (strcmp(buffer,OOV())==0) oov_code=n;
00308
00309 if (++n==lim) grow();
00310
00311 }
00312
00313 inp.close();
00314 }
00315
00316
00317 void dictionary::load(std::istream& inp)
00318 {
00319
00320 char buffer[MAX_WORD];
00321 char *addr;
00322 int size;
00323
00324 inp >> size;
00325
00326 for (int i=0; i<size; i++) {
00327
00328 inp >> setw(MAX_WORD) >> buffer;
00329
00330 tb[n].word=st->push(buffer);
00331 tb[n].code=n;
00332 inp >> tb[n].freq;
00333 N+=tb[n].freq;
00334
00335
00336 if ((addr=htb->insert((char *)&tb[n].word))) {
00337 if (addr!=(char *)&tb[n].word) {
00338 cerr << "dictionary::loadtxt wrong entry was found ("
00339 << buffer << ") in position " << n << "\n";
00340 exit(1);
00341 }
00342 }
00343
00344 if (strcmp(tb[n].word,OOV())==0)
00345 oov_code=n;
00346
00347 if (++n==lim) grow();
00348 }
00349 inp.getline(buffer,MAX_WORD-1);
00350 }
00351
00352
00353 void dictionary::save(std::ostream& out)
00354 {
00355 out << n << "\n";
00356 for (int i=0; i<n; i++)
00357 out << tb[i].word << " " << tb[i].freq << "\n";
00358 }
00359
00360 int cmpdictentry(const void *a,const void *b)
00361 {
00362 dict_entry *ae=(dict_entry *)a;
00363 dict_entry *be=(dict_entry *)b;
00364
00365 if (be->freq-ae->freq)
00366 return be->freq-ae->freq;
00367 else
00368 return strcmp(ae->word,be->word);
00369
00370 }
00371
00372
00373 dictionary::dictionary(dictionary* d,bool prune, int prunethresh)
00374 {
00375 assert(d!=NULL);
00376
00377 n=0;
00378 N=0;
00379
00380 load_factor=d->load_factor;
00381 lim=d->lim;
00382 oov_code=-1;
00383 ifl=0;
00384 dubv=d->dubv;
00385
00386
00387 tb = new dict_entry[lim];
00388 htb = new HASHTABLE_t((size_t) (lim/load_factor));
00389 st = new strstack(lim * 10);
00390
00391
00392 n=0;
00393 for (int i=0; i<d->n; i++)
00394 if (!prune || d->tb[i].freq>=prunethresh){
00395 tb[n].code=n;
00396 tb[n].freq=d->tb[i].freq;
00397 tb[n].word=st->push(d->tb[i].word);
00398 htb->insert((char*)&tb[n].word);
00399
00400 if (d->oov_code==i) oov_code=n;
00401
00402 N+=tb[n].freq;
00403 n++;
00404 }
00405 };
00406
00407 void dictionary::sort()
00408 {
00409 if (htb != NULL ) delete htb;
00410
00411 htb = new HASHTABLE_t((int) (lim/load_factor));
00412
00413 cerr << "sorting dictionary ...";
00414 qsort(tb,n,sizeof(dict_entry),cmpdictentry);
00415 cerr << "done\n";
00416
00417 for (int i=0; i<n; i++) {
00418
00419 if (oov_code==tb[i].code) oov_code=i;
00420 tb[i].code=i;
00421
00422 htb->insert((char*)&tb[i].word);
00423 };
00424
00425 }
00426
00427 dictionary::~dictionary()
00428 {
00429 delete htb;
00430 delete st;
00431 delete [] tb;
00432 }
00433
00434 void dictionary::stat()
00435 {
00436 cout << "dictionary class statistics\n";
00437 cout << "size " << n
00438 << " used memory "
00439 << (lim * sizeof(int) +
00440 htb->used() +
00441 st->used())/1024 << " Kb\n";
00442 }
00443
00444 void dictionary::grow()
00445 {
00446 delete htb;
00447
00448 cerr << "+\b";
00449
00450 int newlim=(int) (lim*GROWTH_STEP);
00451 dict_entry *tb2=new dict_entry[newlim];
00452
00453 memcpy(tb2,tb,sizeof(dict_entry) * lim );
00454
00455 delete [] tb;
00456 tb=tb2;
00457
00458 htb=new HASHTABLE_t((size_t) ((newlim)/load_factor));
00459 for (int i=0; i<lim; i++) {
00460
00461 htb->insert((char*)&tb[i].word);
00462 }
00463
00464 for (int i=lim; i<newlim; i++) tb[i].freq=0;
00465
00466 lim=newlim;
00467 }
00468
00469 void dictionary::save(char *filename,int freqflag)
00470 {
00471
00472 std::ofstream out(filename,ios::out);
00473
00474 if (!out) {
00475 cerr << "cannot open " << filename << "\n";
00476 }
00477
00478
00479 if (freqflag)
00480 out << "DICTIONARY 0 " << n << "\n";
00481 else
00482 out << "dictionary 0 " << n << "\n";
00483
00484 for (int i=0; i<n; i++)
00485 if (tb[i].freq) {
00486 out << tb[i].word;
00487 if (freqflag)
00488 out << " " << tb[i].freq;
00489 out << "\n";
00490 }
00491
00492 out.close();
00493 }
00494
00495
00496 int dictionary::getcode(const char *w)
00497 {
00498 dict_entry* ptr=(dict_entry *)htb->find((char *)&w);
00499 if (ptr==NULL) return -1;
00500 return ptr->code;
00501 }
00502
00503 int dictionary::encode(const char *w)
00504 {
00505
00506 if (strlen(w)==0) {
00507 cerr << "0";
00508 w=OOV();
00509 }
00510
00511
00512 dict_entry* ptr;
00513
00514 if ((ptr=(dict_entry *)htb->find((char *)&w))!=NULL)
00515 return ptr->code;
00516 else {
00517 if (!ifl) {
00518 if (oov_code==-1) {
00519 cerr << "starting to use OOV words [" << w << "]\n";
00520 tb[n].word=st->push(OOV());
00521 htb->insert((char *)&tb[n].word);
00522 tb[n].code=n;
00523 tb[n].freq=0;
00524 oov_code=n;
00525 if (++n==lim) grow();
00526 }
00527
00528 return encode(OOV());
00529 } else {
00530 tb[n].word=st->push((char *)w);
00531 htb->insert((char*)&tb[n].word);
00532 tb[n].code=n;
00533 tb[n].freq=0;
00534 if (++n==lim) grow();
00535 return n-1;
00536 }
00537 }
00538 }
00539
00540
00541 const char *dictionary::decode(int c)
00542 {
00543 if (c>=0 && c < n)
00544 return tb[c].word;
00545 else {
00546 cerr << "decode: code out of boundary\n";
00547 return OOV();
00548 }
00549 }
00550
00551
00552 dictionary_iter::dictionary_iter(dictionary *dict) : m_dict(dict)
00553 {
00554 m_dict->scan(HT_INIT);
00555 }
00556
00557 dict_entry* dictionary_iter::next()
00558 {
00559 return (dict_entry*) m_dict->scan(HT_CONT);
00560 }
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571