00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>
#include <stdexcept>
#include <string>
#include <cassert>
#include "math.h"
#include "mempool.h"
#include "htable.h"
#include "ngramcache.h"
#include "dictionary.h"
#include "n_gram.h"
#include "lmtable.h"
#include "lmmacro.h"
#include "util.h"
00038
00039 using namespace std;
00040
00041
00042
// Reports a fatal condition: the text is echoed on standard error,
// followed by a newline, and then raised as a std::runtime_error
// carrying the same text.
inline void error(const char* message)
{
  std::cerr << message << "\n";
  throw std::runtime_error(message);
}
00048
00049
00050
00051
00052
00053 lmmacro::lmmacro(float nlf, float dlfi):lmtable(nlf,dlfi)
00054 {
00055 dict = new dictionary((char *)NULL,1000000);
00056 getDict()->incflag(1);
00057 };
00058
00059 lmmacro::~lmmacro()
00060 {
00061 if (mapFlag) unloadmap();
00062 }
00063
00064
00065 void lmmacro::load(const std::string filename,int memmap)
00066 {
00067 VERBOSE(2,"lmmacro::load(const std::string filename,int memmap)" << std::endl);
00068
00069
00070 fstream inp(filename.c_str(),ios::in|ios::binary);
00071
00072 char line[MAX_LINE];
00073 const char* words[MAX_TOKEN_N_MAP];
00074 int tokenN;
00075 inp.getline(line,MAX_LINE,'\n');
00076 tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
00077
00078 if (tokenN != 4 || ((strcmp(words[0],"LMMACRO") != 0) && (strcmp(words[0],"lmmacro")!=0)))
00079 error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
00080 maxlev = atoi(words[1]);
00081 selectedField = atoi(words[2]);
00082
00083 if ((strcmp(words[3],"TRUE") == 0) || (strcmp(words[3],"true") == 0))
00084 collapseFlag = true;
00085 else if ((strcmp(words[3],"FALSE") == 0) || (strcmp(words[3],"false") == 0))
00086 collapseFlag = false;
00087 else
00088 error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
00089
00090 #ifdef DLEXICALLM
00091 selectedFieldForLexicon = atoi(words[3]);
00092 collapseFlag = atoi(words[4]);
00093 #endif
00094
00095 if (selectedField == -1)
00096 cerr << "no selected field: the whole string is used" << std::endl;
00097 else
00098 cerr << "selected field n. " << selectedField << std::endl;
00099 if (collapseFlag)
00100 cerr << "collapse is enabled" << std::endl;
00101 else
00102 cerr << "collapse is disabled" << std::endl;
00103
00104
00105 std::string lmfilename;
00106 if (inp.getline(line,MAX_LINE,'\n')) {
00107 tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
00108 lmfilename = words[0];
00109 } else
00110 error((char*)"ERROR: wrong format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
00111
00112 std::string mapfilename = "";
00113 if (inp.getline(line,MAX_LINE,'\n')) {
00114 tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
00115 mapfilename = words[0];
00116 mapFlag = true;
00117 } else {
00118 mapFlag = false;
00119 }
00120
00121 inp.close();
00122
00123
00124 std::cerr << "lmfilename:" << lmfilename << std::endl;
00125 if (mapfilename != "") {
00126 std::cerr << "mapfilename:" << mapfilename << std::endl;
00127 } else {
00128 std::cerr << "no mapfilename" << std::endl;
00129 mapFlag = false;
00130 }
00131
00132
00133 getDict()->incflag(1);
00134
00135
00136 if ((!mapFlag) && (collapseFlag)) {
00137 error((char*)"ERROR: you must specify a map if you want to collapse a specific field!");
00138 }
00139 #ifdef DLEXICALLM
00140
00141 std::string lexicalclassesfilename = lexicalclassesfilename = words[2];
00142 if (lexicalclassesfilename != "NULL" && lexicalclassesfilename != "null") lexicalclassesfilename = "";
00143
00144 if (lexicalclassesfilename != "") std::cerr << "lexicalclassesfilename:" << lexicalclassesfilename << std::endl;
00145 else std::cerr << "no lexicalclassesfilename" << std::endl;
00146
00147
00148 if (lexicalclassesfilename != "") loadLexicalClasses(lexicalclassesfilename.c_str());
00149 #endif
00150
00151
00152 lmtable::load(lmfilename,memmap);
00153
00154
00155 if (mapFlag)
00156 loadmap(mapfilename);
00157
00158 getDict()->genoovcode();
00159 getDict()->incflag(1);
00160 };
00161
00162 void lmmacro::unloadmap()
00163 {
00164 delete dict;
00165 free(microMacroMap);
00166 if (collapseFlag) {
00167 free(collapsableMap);
00168 free(collapsatorMap);
00169 }
00170 #ifdef DLEXICALLM
00171 free(lexicaltoken2classMap);
00172 #endif
00173 }
00174
00175 void lmmacro::loadmap(const std::string mapfilename)
00176 {
00177 microMacroMapN = 0;
00178 microMacroMap = NULL;
00179 collapsableMap = NULL;
00180 collapsatorMap = NULL;
00181
00182 #ifdef DLEXICALLM
00183 lexicaltoken2classMap = NULL;
00184 lexicaltoken2classMapN = 0;
00185 #endif
00186
00187 microMacroMap = (int *)calloc(BUFSIZ, sizeof(int));
00188 if (collapseFlag) {
00189 collapsableMap = (bool *)calloc(BUFSIZ, sizeof(bool));
00190 collapsatorMap = (bool *)calloc(BUFSIZ, sizeof(bool));
00191 }
00192
00193 if (lmtable::getDict()->getcode(BOS_)==-1) {
00194 lmtable::getDict()->incflag(1);
00195 lmtable::getDict()->encode(BOS_);
00196 lmtable::getDict()->incflag(0);
00197 }
00198
00199 if (lmtable::getDict()->getcode(EOS_)==-1) {
00200 lmtable::getDict()->incflag(1);
00201 lmtable::getDict()->encode(EOS_);
00202 lmtable::getDict()->incflag(0);
00203 }
00204
00205 char line[MAX_LINE];
00206 const char* words[MAX_TOKEN_N_MAP];
00207 const char *macroW;
00208 const char *microW;
00209 int tokenN;
00210 bool bos=false,eos=false;
00211
00212
00213 inputfilestream inpMap(mapfilename.c_str());
00214 std::cerr << "Reading map " << mapfilename << "..." << std::endl;
00215 while (inpMap.getline(line,MAX_LINE,'\n')) {
00216 tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
00217 if (tokenN != 2)
00218 error((char*)"ERROR: wrong format of map file\n");
00219 microW = words[0];
00220 macroW = words[1];
00221 getDict()->encode(microW);
00222
00223
00224 if (microMacroMapN>0 && !(microMacroMapN % BUFSIZ)) {
00225 microMacroMap = (int *)realloc(microMacroMap, sizeof(int)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
00226 if (collapseFlag) {
00227
00228
00229 collapsableMap = (bool *)realloc(collapsableMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
00230 collapsatorMap = (bool *)realloc(collapsatorMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
00231 }
00232 }
00233 microMacroMap[microMacroMapN] = lmtable::getDict()->getcode(macroW);
00234
00235 if (collapseFlag) {
00236
00237 int len = strlen(microW)-1;
00238 if (microW[len] == '(') {
00239 collapsableMap[microMacroMapN] = false;
00240 collapsatorMap[microMacroMapN] = true;
00241 } else if (microW[len] == ')') {
00242 collapsableMap[microMacroMapN] = true;
00243 collapsatorMap[microMacroMapN] = false;
00244 } else if (microW[len] == '+') {
00245 collapsableMap[microMacroMapN] = true;
00246 collapsatorMap[microMacroMapN] = true;
00247 } else {
00248 collapsableMap[microMacroMapN] = false;
00249 collapsatorMap[microMacroMapN] = false;
00250 }
00251 }
00252
00253 if (!bos && !strcmp(microW,BOS_)) bos=true;
00254 if (!eos && !strcmp(microW,EOS_)) eos=true;
00255
00256 VERBOSE(2,"\nmicroW = " << microW << "\n"
00257 << "macroW = " << macroW << "\n"
00258 << "microMacroMapN = " << microMacroMapN << "\n"
00259 << "code of micro = " << getDict()->getcode(microW) << "\n"
00260 << "code of macro = " << lmtable::getDict()->getcode(macroW) << "\n");
00261
00262 microMacroMapN++;
00263 }
00264
00265 if ((microMacroMapN == 0) && (selectedField == -1))
00266 error((char*)"ERROR: with no field selection, a map for the whole string is mandatory\n");
00267
00268 if (microMacroMapN>0) {
00269
00270 if (!bos) {
00271 getDict()->encode(BOS_);
00272 if (microMacroMapN && !(microMacroMapN%BUFSIZ))
00273 microMacroMap = (int *)realloc(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ));
00274 microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(BOS_);
00275 }
00276
00277
00278 if (!eos) {
00279 getDict()->encode(EOS_);
00280 if (microMacroMapN && !(microMacroMapN%BUFSIZ))
00281 microMacroMap = (int *)realloc(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ));
00282 microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(EOS_);
00283 }
00284 }
00285
00286
00287 VERBOSE(2,"oovcode(micro)=" << getDict()->oovcode() << "\n"
00288 << "oovcode(macro)=" << lmtable::getDict()->oovcode() << "\n"
00289 << "microMacroMapN = " << microMacroMapN << "\n"
00290 << "macrodictsize = " << getDict()->size() << "\n"
00291 << "microdictsize = " << lmtable::getDict()->size() << "\n");
00292
00293 IFVERBOSE(2) {
00294 for (int i=0; i<microMacroMapN; i++) {
00295 VERBOSE(2,"micro[" << getDict()->decode(i) << "] -> " << lmtable::getDict()->decode(microMacroMap[i]) << "\n");
00296 }
00297 }
00298 std::cerr << "...done\n";
00299 }
00300
00301
00302 double lmmacro::lprob(ngram micro_ng)
00303 {
00304 VERBOSE(2,"lmmacro::lprob, parameter = <" << micro_ng << ">\n");
00305
00306 ngram macro_ng(lmtable::getDict());
00307
00308 if (micro_ng.dict == macro_ng.dict)
00309 macro_ng.trans(micro_ng);
00310 else
00311 map(µ_ng, ¯o_ng);
00312
00313 VERBOSE(3,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
00314 << "lmmacro::lprob: macro_ng = " << macro_ng << "\n");
00315
00316
00317 double prob;
00318 prob = lmtable::lprob(macro_ng);
00319 VERBOSE(3,"prob = " << prob << "\n");
00320
00321 return prob;
00322 };
00323
00324 double lmmacro::clprob(int* codes, int sz, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible)
00325 {
00326 ngram micro_ng(getDict());
00327 micro_ng.pushc(codes,sz);
00328 return clprob(micro_ng,bow,bol,state,statesize,extendible);
00329 }
00330
00331 double lmmacro::clprob(ngram micro_ng, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible)
00332 {
00333
00334 VERBOSE(3," lmmacro::clprob(ngram), parameter = <" << micro_ng << ">\n");
00335
00336 ngram transformed_ng(lmtable::getDict());
00337 bool collapsed = transform(micro_ng, transformed_ng);
00338 VERBOSE(3,"lmmacro::clprob(ngram), transformed_ng = <" << transformed_ng << ">\n");
00339
00340 double logpr;
00341
00342 if (collapsed) {
00343
00344
00345 VERBOSE(3," SKIPPED call to lmtable::clprob because of collapse; logpr: 0.0\n");
00346 logpr = 0.0;
00347 } else {
00348 VERBOSE(3," QUERY MACRO LM on (after transformation and size reduction) " << transformed_ng << "\n");
00349 logpr = lmtable::clprob(transformed_ng, bow, bol, state, statesize, extendible);
00350 }
00351 VERBOSE(3," GET logpr: " << logpr << "\n");
00352
00353 return logpr;
00354 }
00355
00356 bool lmmacro::transform(ngram &in, ngram &out)
00357 {
00358 VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), in = <" << in << ">\n");
00359
00360
00361 ngram field_ng(getDict());
00362 if (selectedField >= 0)
00363 field_selection(in, field_ng);
00364 else
00365 field_ng = in;
00366
00367
00368 ngram collapsed_ng(getDict());
00369 bool collapsed = false;
00370 if (collapseFlag)
00371 collapsed = collapse(field_ng, collapsed_ng);
00372 else
00373 collapsed_ng = field_ng;
00374
00375
00376 if (mapFlag)
00377 mapping(collapsed_ng, out);
00378 else
00379 out.trans(collapsed_ng);
00380
00381 if (out.size>lmtable::maxlevel()) out.size=lmtable::maxlevel();
00382
00383 VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), out = <" << out << ">\n");
00384 return collapsed;
00385 }
00386
00387
00388
00389 void lmmacro::field_selection(ngram &in, ngram &out)
00390 {
00391 VERBOSE(3,"In lmmacro::field_selection(ngram &in, ngram &out) in = " << in << "\n");
00392
00393 int microsize = in.size;
00394
00395 for (int i=microsize; i>0; i--) {
00396
00397 char curr_token[BUFSIZ];
00398 strcpy(curr_token, getDict()->decode(*in.wordp(i)));
00399 char *field;
00400 if (strcmp(curr_token,"<s>") &&
00401 strcmp(curr_token,"</s>") &&
00402 strcmp(curr_token,"_unk_")) {
00403 field = strtok(curr_token, "#");
00404 int j=0;
00405 while (j<selectedField && field != NULL) {
00406 field = strtok(0, "#");
00407 j++;
00408 }
00409 } else {
00410 field = curr_token;
00411 }
00412
00413
00414 if (field) {
00415 out.pushw(field);
00416 } else {
00417
00418 out.pushw((char*)"_unk_");
00419
00420
00421
00422
00437 }
00438 }
00439 VERBOSE(3,"In lmmacro::field_selection(ngram &in, ngram &out) out = " << out << "\n");
00440 return;
00441 }
00442
00443 bool lmmacro::collapse(ngram &in, ngram &out)
00444 {
00445 VERBOSE(3,"In lmmacro::collapse(ngram &in, ngram &out) in = " << in << "\n")
00446
00447
00448
00449
00450 int microsize = in.size;
00451 out.size = 0;
00452
00453 if (microsize == 1) {
00454 out.pushc(*in.wordp(1));
00455 return false;
00456 }
00457
00458 int curr_code = *in.wordp(1);
00459 int prev_code = *in.wordp(2);
00460
00461 if (microMacroMap[curr_code] == microMacroMap[prev_code]) {
00462 if (collapsableMap[curr_code] && collapsatorMap[prev_code]) {
00463 return true;
00464 }
00465 }
00466
00467
00468
00469
00470 prev_code = *in.wordp(microsize);
00471 out.pushc(prev_code);
00472
00473 for (int i=microsize-1; i>1; i--) {
00474
00475 curr_code = *in.wordp(i);
00476
00477 if (microMacroMap[curr_code] != microMacroMap[prev_code]) {
00478 out.pushc(curr_code);
00479 } else {
00480 if (!(collapsableMap[curr_code] && collapsatorMap[prev_code])) {
00481 out.pushc(prev_code);
00482 }
00483 }
00484 prev_code = curr_code;
00485 }
00486
00487 out.pushc(*in.wordp(1));
00488 VERBOSE(3,"In lmmacro::collapse(ngram &in, ngram &out) out = " << out << "\n");
00489 return false;
00490 }
00491
00492 void lmmacro::mapping(ngram &in, ngram &out)
00493 {
00494 VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) in = " << in << "\n");
00495
00496 int microsize = in.size;
00497
00498
00499
00500 for (int i=microsize; i>0; i--) {
00501
00502 int in_code = *in.wordp(i);
00503 int out_code;
00504 if (in_code < microMacroMapN)
00505 out_code = microMacroMap[in_code];
00506 else
00507 out_code = lmtable::getDict()->oovcode();
00508
00509 out.pushc(out_code);
00510 }
00511 VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) out = " << out << "\n");
00512 return;
00513 }
00514
00515
00516
00517
00518
00519
00520
00521 const char *lmmacro::maxsuffptr(ngram micro_ng, unsigned int* size)
00522 {
00523 ngram macro_ng(lmtable::getDict());
00524
00525 if (micro_ng.dict == macro_ng.dict)
00526 macro_ng.trans(micro_ng);
00527 else
00528 map(µ_ng, ¯o_ng);
00529
00530 VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
00531 << "lmmacro::lprob: macro_ng = " << macro_ng << "\n");
00532
00533 return lmtable::maxsuffptr(macro_ng,size);
00534 }
00535
00536 const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size)
00537 {
00538
00539
00540
00541
00542
00543
00544
00545
00546 ngram macro_ng(lmtable::getDict());
00547
00548 if (micro_ng.dict == macro_ng.dict)
00549 macro_ng.trans(micro_ng);
00550 else
00551 map(µ_ng, ¯o_ng);
00552
00553 VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
00554 << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
00555
00556 return lmtable::cmaxsuffptr(macro_ng,size);
00557
00558 }
00559
00560
00561 void lmmacro::map(ngram *in, ngram *out)
00562 {
00563
00564 VERBOSE(2,"In lmmacro::map, in = " << *in << endl
00565 << " (selectedField = " << selectedField << " )\n");
00566
00567 if (selectedField==-2)
00568 One2OneMapping(in, out);
00569
00570 else if (selectedField==-1)
00571 Micro2MacroMapping(in, out);
00572
00573 else if (selectedField<10) {
00574 ngram field_ng(((lmmacro *)this)->getDict());
00575 int microsize = in->size;
00576
00577 for (int i=microsize; i>0; i--) {
00578
00579 char curr_token[BUFSIZ];
00580 strcpy(curr_token, ((lmmacro *)this)->getDict()->decode(*(in->wordp(i))));
00581 char *field;
00582 if (strcmp(curr_token,"<s>") &&
00583 strcmp(curr_token,"</s>") &&
00584 strcmp(curr_token,"_unk_")) {
00585 field = strtok(curr_token, "#");
00586 int j=0;
00587 while (j<selectedField && field != NULL) {
00588 field = strtok(0, "#");
00589 j++;
00590 }
00591 } else {
00592 field = curr_token;
00593 }
00594
00595 if (field)
00596 field_ng.pushw(field);
00597 else {
00598
00599 field_ng.pushw((char*)"_unk_");
00600
00601
00602
00603
00618 }
00619 }
00620 if (microMacroMapN>0)
00621 Micro2MacroMapping(&field_ng, out);
00622 else
00623 out->trans(field_ng);
00624 } else {
00625
00626 #ifdef DLEXICALLM
00627
00628
00629
00630
00631 int tagIdx = selectedField/10;
00632 int lemmaIdx = selectedField%10;
00633
00634
00635 ngram tag_ng(getDict());
00636 char *lemmas[BUFSIZ];
00637
00638 int microsize = in->size;
00639 for (int i=microsize; i>0; i--) {
00640 char curr_token[BUFSIZ];
00641 strcpy(curr_token, getDict()->decode(*(in->wordp(i))));
00642 char *tag = NULL, *lemma = NULL;
00643
00644 if (strcmp(curr_token,"<s>") &&
00645 strcmp(curr_token,"</s>") &&
00646 strcmp(curr_token,"_unk_")) {
00647
00648 if (tagIdx<lemmaIdx) {
00649 tag = strtok(curr_token, "#");
00650 for (int j=0; j<tagIdx; j++)
00651 tag = strtok(0, "#");
00652 for (int j=tagIdx; j<lemmaIdx; j++)
00653 lemma = strtok(0, "#");
00654 } else {
00655 lemma = strtok(curr_token, "#");
00656 for (int j=0; j<lemmaIdx; j++)
00657 lemma = strtok(0, "#");
00658 for (int j=lemmaIdx; j<tagIdx; j++)
00659 tag = strtok(0, "#");
00660 }
00661
00662 VERBOSE(3,"(tag,lemma) = " << tag << " " << lemma << "\n");
00663 } else {
00664 tag = curr_token;
00665 lemma = curr_token;
00666 VERBOSE(3,"(tag=lemma) = " << tag << " " << lemma << "\n");
00667 }
00668 if (tag) {
00669 tag_ng.pushw(tag);
00670 lemmas[i] = strdup(lemma);
00671 } else {
00672 tag_ng.pushw((char*)"_unk_");
00673 lemmas[i] = strdup("_unk_");
00674 }
00675 }
00676
00677 if (microMacroMapN>0)
00678 Micro2MacroMapping(&tag_ng, out, lemmas);
00679 else
00680 out->trans(tag_ng);
00681
00682 #endif
00683
00684 }
00685
00686 VERBOSE(2,"In lmmacro::map, FINAL out = " << *out << endl);
00687 }
00688
00689 void lmmacro::One2OneMapping(ngram *in, ngram *out)
00690 {
00691 int insize = in->size;
00692
00693
00694
00695 for (int i=insize; i>0; i--) {
00696
00697 int curr_code = *(in->wordp(i));
00698 const char *outtoken =
00699 lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
00700 out->pushw(outtoken);
00701 }
00702 return;
00703 }
00704
00705
00706 void lmmacro::Micro2MacroMapping(ngram *in, ngram *out)
00707 {
00708
00709 int microsize = in->size;
00710
00711 VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n");
00712
00713
00714
00715 for (int i=microsize; i>0; i--) {
00716
00717 int curr_code = *(in->wordp(i));
00718 const char *curr_macrotag = lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
00719
00720 if (i==microsize) {
00721 out->pushw(curr_macrotag);
00722
00723 } else {
00724 int prev_code = *(in->wordp(i+1));
00725
00726 const char *prev_microtag = getDict()->decode(prev_code);
00727 const char *curr_microtag = getDict()->decode(curr_code);
00728 const char *prev_macrotag = lmtable::getDict()->decode((prev_code<microMacroMapN)?microMacroMap[prev_code]:lmtable::getDict()->oovcode());
00729
00730
00731 int prev_len = strlen(prev_microtag)-1;
00732 int curr_len = strlen(curr_microtag)-1;
00733
00734 if (strcmp(curr_macrotag,prev_macrotag) != 0 ||
00735 !(
00736 (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(')) ||
00737 (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && curr_microtag[curr_len]=='+' ) ||
00738 (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) ||
00739 (prev_microtag[prev_len]== '+' && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))))
00740 out->pushw(curr_macrotag);
00741 }
00742 }
00743 return;
00744 }
00745
00746
00747
00748
00749
00750 #ifdef DLEXICALLM
00751
00752 void lmmacro::Micro2MacroMapping(ngram *in, ngram *out, char **lemmas)
00753 {
00754 VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n")
00755
00756 int microsize = in->size;
00757
00758 IFVERBOSE(3) {
00759 VERBOSE(3,"In Micro2MacroMapping, lemmas:\n");
00760 if (lexicaltoken2classMap)
00761 for (int i=microsize; i>0; i--)
00762 VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << " -> class -> " << lexicaltoken2classMap[lmtable::getDict()->encode(lemmas[i])] << endl);
00763 else
00764 for (int i=microsize; i>0; i--)
00765 VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << endl);
00766 }
00767
00768
00769
00770 char tag_lemma[BUFSIZ];
00771
00772 for (int i=microsize; i>0; i--) {
00773
00774 int curr_code = *(in->wordp(i));
00775
00776 const char *curr_microtag = getDict()->decode(curr_code);
00777 const char *curr_lemma = lemmas[i];
00778 const char *curr_macrotag = lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
00779 int curr_len = strlen(curr_microtag)-1;
00780
00781 if (i==microsize) {
00782 if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' ))
00783 sprintf(tag_lemma, "%s", curr_macrotag);
00784 else if (lexicaltoken2classMap)
00785 sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]);
00786 else
00787 sprintf(tag_lemma, "%s_%s", curr_macrotag, lemmas[microsize]);
00788
00789 VERBOSE(2,"In Micro2MacroMapping, starting tag_lemma = >" << tag_lemma << "<\n");
00790
00791 out->pushw(tag_lemma);
00792 free(lemmas[microsize]);
00793
00794
00795 } else {
00796
00797 int prev_code = *(in->wordp(i+1));
00798 const char *prev_microtag = getDict()->decode(prev_code);
00799 const char *prev_macrotag = lmtable::getDict()->decode((prev_code<microMacroMapN)?microMacroMap[prev_code]:lmtable::getDict()->oovcode());
00800
00801
00802 int prev_len = strlen(prev_microtag)-1;
00803
00804 if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' ))
00805 sprintf(tag_lemma, "%s", curr_macrotag);
00806 else if (lexicaltoken2classMap)
00807 sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]);
00808 else
00809 sprintf(tag_lemma, "%s_%s", curr_macrotag, curr_lemma);
00810
00811 VERBOSE(2,"In Micro2MacroMapping, tag_lemma = >" << tag_lemma << "<\n");
00812
00813 if (strcmp(curr_macrotag,prev_macrotag) != 0 ||
00814 !(
00815 (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!=')' )) && curr_microtag[curr_len]==')' && curr_microtag[0]!='(') ||
00816 (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')')) && curr_microtag[curr_len]=='+' ) ||
00817 (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) ||
00818 (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))) {
00819
00820 VERBOSE(2,"In Micro2MacroMapping, before pushw, out = " << *out << endl);
00821 out->pushw(tag_lemma);
00822 VERBOSE(2,"In Micro2MacroMapping, after pushw, out = " << *out << endl);
00823 } else {
00824 VERBOSE(2,"In Micro2MacroMapping, before shift, out = " << *out << endl);
00825 out->shift();
00826 VERBOSE(2,"In Micro2MacroMapping, after shift, out = " << *out << endl);
00827 out->pushw(tag_lemma);
00828 VERBOSE(2,"In Micro2MacroMapping, after push, out = " << *out << endl);
00829 }
00830 free(lemmas[i]);
00831 }
00832 }
00833 return;
00834 }
00835
00836 void lmmacro::loadLexicalClasses(const char *fn)
00837 {
00838 char line[MAX_LINE];
00839 const char* words[MAX_TOKEN_N_MAP];
00840 int tokenN;
00841
00842 lexicaltoken2classMap = (int *)calloc(BUFSIZ, sizeof(int));
00843 lexicaltoken2classMapN = BUFSIZ;
00844
00845 lmtable::getDict()->incflag(1);
00846
00847 inputfilestream inp(fn);
00848 while (inp.getline(line,MAX_LINE,'\n')) {
00849 tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
00850 if (tokenN != 2)
00851 error((char*)"ERROR: wrong format of lexical classes file\n");
00852 else {
00853 int classIdx = atoi(words[1]);
00854 int wordCode = lmtable::getDict()->encode(words[0]);
00855
00856 if (wordCode>=lexicaltoken2classMapN) {
00857 int r = (wordCode-lexicaltoken2classMapN)/BUFSIZ;
00858 lexicaltoken2classMapN += (r+1)*BUFSIZ;
00859 lexicaltoken2classMap = (int *)realloc(lexicaltoken2classMap, sizeof(int)*lexicaltoken2classMapN);
00860 }
00861 lexicaltoken2classMap[wordCode] = classIdx;
00862 }
00863 }
00864
00865 lmtable::getDict()->incflag(0);
00866
00867 IFVERBOSE(3) {
00868 for (int x=0; x<lmtable::getDict()->size(); x++)
00869 VERBOSE(3,"class of <" << lmtable::getDict()->decode(x) << "> (code=" << x << ") = " << lexicaltoken2classMap[x] << endl);
00870 }
00871
00872 return;
00873 }
00874
00875
00876 void lmmacro::cutLex(ngram *in, ngram *out)
00877 {
00878 *out=*in;
00879
00880 const char *curr_macro = out->dict->decode(*(out->wordp(1)));
00881 out->shift();
00882 const char *p = strrchr(curr_macro, '_');
00883 int lexLen;
00884 if (p)
00885 lexLen=strlen(p);
00886 else
00887 lexLen=0;
00888 char curr_NoLexMacro[BUFSIZ];
00889 memset(&curr_NoLexMacro,0,BUFSIZ);
00890 strncpy(curr_NoLexMacro,curr_macro,strlen(curr_macro)-lexLen);
00891 out->pushw(curr_NoLexMacro);
00892 return;
00893 }
00894 #endif
00895