00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "ParallelBackoff.h"
00023
00024 #include <vector>
00025 #include <string>
00026 #include <sstream>
00027 #include <fstream>
00028
00029 #include "MultiFactor.h"
00030 #include "moses/Word.h"
00031 #include "moses/Factor.h"
00032 #include "moses/FactorTypeSet.h"
00033 #include "moses/FactorCollection.h"
00034 #include "moses/Phrase.h"
00035 #include "moses/TypeDef.h"
00036 #include "moses/Util.h"
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049 #ifdef __APPLE__
00050 #define HAVE_ZOPEN
00051 #endif
00052
00053 #include "FNgramSpecs.h"
00054 #include "FNgramStats.h"
00055 #include "FactoredVocab.h"
00056 #include "FNgram.h"
00057 #include "wmatrix.h"
00058 #include "Vocab.h"
00059 #include "File.h"
00060
00061 using namespace std;
00062
00063 namespace Moses
00064 {
00065
00066 namespace
00067 {
00068 class LanguageModelParallelBackoff : public LanguageModelMultiFactor
00069 {
00070 private:
00071 std::vector<FactorType> m_factorTypesOrdered;
00072
00073 FactoredVocab *m_srilmVocab;
00074 FNgram *m_srilmModel;
00075 VocabIndex m_unknownId;
00076 VocabIndex m_wtid;
00077 VocabIndex m_wtbid;
00078 VocabIndex m_wteid;
00079 FNgramSpecs<FNgramCount>* fnSpecs;
00080
00081 std::map<size_t, VocabIndex>* lmIdMap;
00082 std::fstream* debugStream;
00083
00084 WidMatrix *widMatrix;
00085
00086 public:
00087 LanguageModelParallelBackoff(const std::string &line)
00088 :LanguageModelMultiFactor(line) {
00089 }
00090
00091 ~LanguageModelParallelBackoff();
00092
00093 bool Load(const std::string &filePath, const std::vector<FactorType> &factorTypes, size_t nGramOrder);
00094
00095 VocabIndex GetLmID( const std::string &str ) const;
00096
00097 VocabIndex GetLmID( const Factor *factor, FactorType ft ) const;
00098
00099 void CreateFactors();
00100
00101 LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
00102 const FFState *GetNullContextState() const;
00103 const FFState *GetBeginSentenceState() const;
00104 FFState *NewState(const FFState *from) const;
00105 };
00106
00107 LanguageModelParallelBackoff::~LanguageModelParallelBackoff()
00108 {
00110 }
00111
00112
00113 bool LanguageModelParallelBackoff::Load(const std::string &filePath, const std::vector<FactorType> &factorTypes, size_t nGramOrder)
00114 {
00115
00116 cerr << "Loading Language Model Parallel Backoff!!!\n";
00117 widMatrix = new ::WidMatrix();
00118 m_factorTypes = FactorMask(factorTypes);
00119 m_srilmVocab = new ::FactoredVocab();
00120
00121
00122 fnSpecs = 0;
00123 File f(filePath.c_str(),"r");
00124 fnSpecs = new ::FNgramSpecs<FNgramCount>(f,*m_srilmVocab, 0);
00125
00126 cerr << "Loaded fnSpecs!\n";
00127
00128 m_srilmVocab->unkIsWord() = true;
00129 m_srilmVocab->nullIsWord() = true;
00130 m_srilmVocab->toLower() = false;
00131
00132 FNgramStats *factoredStats = new FNgramStats(*m_srilmVocab, *fnSpecs);
00133
00134 factoredStats->debugme(2);
00135
00136 cerr << "Factored stats\n";
00137
00138 FNgram* fngramLM = new FNgram(*m_srilmVocab,*fnSpecs);
00139
00140 cerr << "FNgram object created\n";
00141
00142 fngramLM->skipOOVs = false;
00143
00144 if (!factoredStats->read()) {
00145 cerr << "error reading in counts in factor file\n";
00146 exit(1);
00147 }
00148
00149 cerr << "Factored stats read!\n";
00150
00151 factoredStats->estimateDiscounts();
00152 factoredStats->computeCardinalityFunctions();
00153 factoredStats->sumCounts();
00154
00155 cerr << "Another three operations made!\n";
00156
00157 if (!fngramLM->read()) {
00158 cerr << "format error in lm file\n";
00159 exit(1);
00160 }
00161
00162 cerr << "fngramLM reads!\n";
00163
00164 m_filePath = filePath;
00165 m_nGramOrder= nGramOrder;
00166
00167 m_factorTypesOrdered= factorTypes;
00168
00169 m_unknownId = m_srilmVocab->unkIndex();
00170
00171 cerr << "m_unknowdId = " << m_unknownId << endl;
00172
00173 m_srilmModel = fngramLM;
00174
00175 cerr << "Create factors...\n";
00176
00177 CreateFactors();
00178
00179 cerr << "Factors created! \n";
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203 return true;
00204 }
00205
00206 VocabIndex LanguageModelParallelBackoff::GetLmID( const std::string &str ) const
00207 {
00208 return m_srilmVocab->getIndex( str.c_str(), m_unknownId );
00209 }
00210
00211 VocabIndex LanguageModelParallelBackoff::GetLmID( const Factor *factor, size_t ft ) const
00212 {
00213
00214 size_t factorId = factor->GetId();
00215 if ( lmIdMap->find( factorId * 10 + ft ) != lmIdMap->end() ) {
00216 return lmIdMap->find( factorId * 10 + ft )->second;
00217 } else {
00218 return m_unknownId;
00219 }
00220
00221 }
00222
00223 void LanguageModelParallelBackoff::CreateFactors()
00224 {
00225
00226
00227 FactorCollection &factorCollection = FactorCollection::Instance();
00228
00229 lmIdMap = new std::map<size_t, VocabIndex>();
00230
00231
00232 VocabString str;
00233 VocabIter iter(*m_srilmVocab);
00234
00235 iter.init();
00236
00237 size_t pomFactorTypeNum = 0;
00238
00239
00240 while ( (str = iter.next()) != NULL) {
00241
00242 if ((str[0] < 'a' || str[0] > 'k') && str[0] != 'W') {
00243 continue;
00244 }
00245 VocabIndex lmId = GetLmID(str);
00246 pomFactorTypeNum = str[0] - 'a';
00247
00248 size_t factorId = factorCollection.AddFactor(Output, m_factorTypesOrdered[pomFactorTypeNum], &(str[2]) )->GetId();
00249 (*lmIdMap)[factorId * 10 + pomFactorTypeNum] = lmId;
00250 }
00251
00252 size_t factorIdStart;
00253 size_t factorIdEnd;
00254
00255
00256 for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) {
00257 FactorType factorType = m_factorTypesOrdered[index];
00258 m_sentenceStartWord[index] = factorCollection.AddFactor(Output, factorType, BOS_);
00259
00260
00261 m_sentenceEndWord[index] = factorCollection.AddFactor(Output, factorType, EOS_);
00262
00263 factorIdStart = m_sentenceStartWord[index]->GetId();
00264 factorIdEnd = m_sentenceEndWord[index]->GetId();
00265
00266
00267
00268
00269
00270
00271
00272 (*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
00273 (*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);
00274
00275 cerr << "BOS_:" << GetLmID(BOS_) << ", EOS_:" << GetLmID(EOS_) << endl;
00276
00277 }
00278
00279 m_wtid = GetLmID("W-<unk>");
00280 m_wtbid = GetLmID("W-<s>");
00281 m_wteid = GetLmID("W-</s>");
00282
00283 cerr << "W-<unk> index: " << m_wtid << endl;
00284 cerr << "W-<s> index: " << m_wtbid << endl;
00285 cerr << "W-</s> index: " << m_wteid << endl;
00286
00287
00288 }
00289
00290 LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & ) const
00291 {
00292
00293 static WidMatrix widMatrix;
00294
00295 for (int i=0; i<contextFactor.size(); i++)
00296 ::memset(widMatrix[i],0,(m_factorTypesOrdered.size() + 1)*sizeof(VocabIndex));
00297
00298
00299 for (size_t i = 0; i < contextFactor.size(); i++) {
00300 const Word &word = *contextFactor[i];
00301
00302 for (size_t j = 0; j < m_factorTypesOrdered.size(); j++) {
00303 const Factor *factor = word[ m_factorTypesOrdered[j] ];
00304
00305 if (factor == NULL)
00306 widMatrix[i][j + 1] = 0;
00307 else
00308 widMatrix[i][j + 1] = GetLmID(factor, j);
00309 }
00310
00311 if (widMatrix[i][1] == GetLmID(m_sentenceStartWord[0], 0) ) {
00312 widMatrix[i][0] = m_wtbid;
00313 } else if (widMatrix[i][1] == GetLmID(m_sentenceEndWord[0], 0 )) {
00314 widMatrix[i][0] = m_wteid;
00315 } else {
00316 widMatrix[i][0] = m_wtid;
00317 }
00318 }
00319
00320
00321 LMResult ret;
00322 ret.score = m_srilmModel->wordProb( widMatrix, contextFactor.size() - 1, contextFactor.size() );
00323 ret.score = FloorScore(TransformLMScore(ret.score));
00324 ret.unknown = !contextFactor.empty() && (widMatrix[contextFactor.size() - 1][0] == m_unknownId);
00325 return ret;
00326
00327
00328
00329
00330
00331
00332
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347
00348
00349 }
00350
00351
00352 FFState *LanguageModelParallelBackoff::NewState(const FFState * ) const
00353 {
00354 return NULL;
00355 }
00356
00357 const FFState *LanguageModelParallelBackoff::GetNullContextState() const
00358 {
00359 return NULL;
00360 }
00361
00362 const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
00363 {
00364 return NULL;
00365 }
00366
00367 }
00368
00369
00370 }
00371