00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <cstdlib>
00024 #include <stdlib.h>
00025 #include <iostream>
00026 #include <stdexcept>
00027 #include <string>
00028 #include <cassert>
00029 #include "lmContainer.h"
00030 #include "lmInterpolation.h"
00031
00032 using namespace std;
00033
// Reports a fatal condition: the message is echoed on standard error
// (followed by a newline) and then raised as a std::runtime_error so that
// the caller can decide how to react.
inline void error(const char* message)
{
  std::cerr << message;
  std::cerr << "\n";

  throw std::runtime_error(message);
}
00039
00040 lmInterpolation::lmInterpolation(float nlf, float dlf)
00041 {
00042 ngramcache_load_factor = nlf;
00043 dictionary_load_factor = dlf;
00044
00045 order=0;
00046 memmap=0;
00047 isInverted=false;
00048 }
00049
00050 void lmInterpolation::load(const std::string filename,int mmap)
00051 {
00052 VERBOSE(2,"lmInterpolation::load(const std::string filename,int memmap)" << std::endl);
00053 VERBOSE(2," filename:|" << filename << "|" << std::endl);
00054
00055
00056 dictionary_upperbound=1000000;
00057 int memmap=mmap;
00058
00059
00060 dict=new dictionary((char *)NULL,1000000,dictionary_load_factor);
00061
00062
00063 fstream inp(filename.c_str(),ios::in|ios::binary);
00064
00065 char line[MAX_LINE];
00066 const char* words[MAX_TOKEN];
00067 int tokenN;
00068 inp.getline(line,MAX_LINE,'\n');
00069 tokenN = parseWords(line,words,MAX_TOKEN);
00070
00071 if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0)))
00072 error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
00073
00074 m_number_lm = atoi(words[1]);
00075
00076 m_weight.resize(m_number_lm);
00077 m_file.resize(m_number_lm);
00078 m_isinverted.resize(m_number_lm);
00079 m_lm.resize(m_number_lm);
00080
00081 VERBOSE(2,"lmInterpolation::load(const std::string filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;);
00082
00083 dict->incflag(1);
00084 for (int i=0; i<m_number_lm; i++) {
00085 inp.getline(line,BUFSIZ,'\n');
00086 tokenN = parseWords(line,words,3);
00087
00088 if(tokenN < 2 || tokenN >3) {
00089 error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
00090 }
00091
00092
00093 m_isinverted[i] = false;
00094 if(tokenN == 3) {
00095 if (strcmp(words[2],"inverted") == 0)
00096 m_isinverted[i] = true;
00097 }
00098 VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
00099
00100 m_weight[i] = (float) atof(words[0]);
00101 m_file[i] = words[1];
00102 VERBOSE(2,"lmInterpolation::load(const std::string filename,int mmap) m_file:"<< words[1] << std::endl;);
00103
00104 m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
00105
00106 m_isinverted[i] = m_lm[i]->is_inverted();
00107
00108 dictionary *_dict=m_lm[i]->getDict();
00109 for (int j=0; j<_dict->size(); j++) {
00110 dict->encode(_dict->decode(j));
00111 }
00112 }
00113 getDict()->genoovcode();
00114
00115 getDict()->incflag(1);
00116 inp.close();
00117
00118 int maxorder = 0;
00119 for (int i=0; i<m_number_lm; i++) {
00120 maxorder = (maxorder > m_lm[i]->maxlevel())?maxorder:m_lm[i]->maxlevel();
00121 }
00122
00123 if (order == 0) {
00124 order = maxorder;
00125 std::cerr << "order is not set; reset to the maximum order of LMs: " << order << std::endl;
00126 } else if (order > maxorder) {
00127 order = maxorder;
00128 std::cerr << "order is too high; reset to the maximum order of LMs: " << order << std::endl;
00129 }
00130 maxlev=order;
00131 }
00132
00133 lmContainer* lmInterpolation::load_lm(int i,int memmap, float nlf, float dlf)
00134 {
00135
00136
00137 lmContainer* lmt=NULL;
00138
00139 lmt = lmt->CreateLanguageModel(m_file[i],nlf,dlf);
00140
00141
00142 lmt->is_inverted(m_isinverted[i]);
00143
00144 lmt->setMaxLoadedLevel(requiredMaxlev);
00145
00146 lmt->load(m_file[i], memmap);
00147
00148 lmt->init_caches(lmt->maxlevel());
00149 return lmt;
00150 }
00151
00152
00153 double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
00154 {
00155
00156 double pr=0.0;
00157 double _logpr;
00158
00159 char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
00160 unsigned int _statesize=0,actualstatesize=0;
00161 int _bol=0,actualbol=MAX_NGRAM;
00162 double _bow=0.0,actualbow=0.0;
00163
00164 bool* _extendible=NULL,actualextendible=false;
00165
00166 if (extendible) {
00167 _extendible=new bool;
00168 _extendible=false;
00169 }
00170
00171 for (size_t i=0; i<m_lm.size(); i++) {
00172
00173 ngram _ng(m_lm[i]->getDict());
00174 _ng.trans(ng);
00175
00176 _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,_extendible);
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197 pr+=m_weight[i]*pow(10.0,_logpr);
00198 actualbow+=m_weight[i]*pow(10.0,_bow);
00199
00200 if(_statesize > actualstatesize || i == 0) {
00201 actualmaxsuffptr = _maxsuffptr;
00202 actualstatesize = _statesize;
00203 }
00204 if (_bol < actualbol) {
00205 actualbol=_bol;
00206 }
00207 if (_extendible) {
00208 actualextendible=true;
00209 }
00210 }
00211 if (bol) *bol=actualbol;
00212 if (bow) *bow=log(actualbow);
00213 if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
00214 if (statesize) *statesize=actualstatesize;
00215 if (extendible) {
00216 *extendible=actualextendible;
00217 delete _extendible;
00218 }
00219
00220
00221
00222
00223
00224
00225 return log(pr)/M_LN10;
00226 }
00227
00228 double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
00229 {
00230
00231
00232 ngram ong(dict);
00233 ong.pushc(codes,sz);
00234 assert (ong.size == sz);
00235
00236 return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
00237 }
00238
00239 double lmInterpolation::setlogOOVpenalty(int dub)
00240 {
00241 assert(dub > dict->size());
00242 double _logpr;
00243 double OOVpenalty=0.0;
00244 for (int i=0; i<m_number_lm; i++) {
00245 m_lm[i]->setlogOOVpenalty(dub);
00246 _logpr=m_lm[i]->getlogOOVpenalty();
00247 OOVpenalty+=m_weight[i]*exp(_logpr);
00248 }
00249 logOOVpenalty=log(OOVpenalty);
00250 return logOOVpenalty;
00251 }
00252