00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "moses/TranslationModel/PhraseDictionaryGroup.h"
00021
00022 #include <boost/foreach.hpp>
00023 #include <boost/unordered_map.hpp>
00024
00025 #include "util/exception.hh"
00026
00027 using namespace std;
00028 using namespace boost;
00029
00030 namespace Moses
00031 {
00032
00033 PhraseDictionaryGroup::PhraseDictionaryGroup(const string &line)
00034 : PhraseDictionary(line, true),
00035 m_numModels(0),
00036 m_totalModelScores(0),
00037 m_phraseCounts(false),
00038 m_wordCounts(false),
00039 m_modelBitmapCounts(false),
00040 m_restrict(false),
00041 m_haveDefaultScores(false),
00042 m_defaultAverageOthers(false),
00043 m_scoresPerModel(0),
00044 m_haveMmsaptLrFunc(false)
00045 {
00046 ReadParameters();
00047 }
00048
00049 void PhraseDictionaryGroup::SetParameter(const string& key, const string& value)
00050 {
00051 if (key == "members") {
00052 m_memberPDStrs = Tokenize(value, ",");
00053 m_numModels = m_memberPDStrs.size();
00054 m_seenByAll = dynamic_bitset<>(m_numModels);
00055 m_seenByAll.set();
00056 } else if (key == "restrict") {
00057 m_restrict = Scan<bool>(value);
00058 } else if (key == "phrase-counts") {
00059 m_phraseCounts = Scan<bool>(value);
00060 } else if (key == "word-counts") {
00061 m_wordCounts = Scan<bool>(value);
00062 } else if (key == "model-bitmap-counts") {
00063 m_modelBitmapCounts = Scan<bool>(value);
00064 } else if (key =="default-scores") {
00065 m_haveDefaultScores = true;
00066 m_defaultScores = Scan<float>(Tokenize(value, ","));
00067 } else if (key =="default-average-others") {
00068 m_defaultAverageOthers = Scan<bool>(value);
00069 } else if (key =="mmsapt-lr-func") {
00070 m_haveMmsaptLrFunc = true;
00071 } else {
00072 PhraseDictionary::SetParameter(key, value);
00073 }
00074 }
00075
00076 void PhraseDictionaryGroup::Load(AllOptions::ptr const& opts)
00077 {
00078 m_options = opts;
00079 SetFeaturesToApply();
00080 m_pdFeature.push_back(const_cast<PhraseDictionaryGroup*>(this));
00081 size_t numScoreComponents = 0;
00082
00083
00084 BOOST_FOREACH(const string& pdName, m_memberPDStrs) {
00085 bool pdFound = false;
00086 BOOST_FOREACH(PhraseDictionary* pd, PhraseDictionary::GetColl()) {
00087 if (pd->GetScoreProducerDescription() == pdName) {
00088 pdFound = true;
00089 m_memberPDs.push_back(pd);
00090 size_t nScores = pd->GetNumScoreComponents();
00091 numScoreComponents += nScores;
00092 if (m_scoresPerModel == 0) {
00093 m_scoresPerModel = nScores;
00094 } else if (m_defaultAverageOthers) {
00095 UTIL_THROW_IF2(nScores != m_scoresPerModel,
00096 m_description << ": member models must have the same number of scores when using default-average-others");
00097 }
00098 }
00099 }
00100 UTIL_THROW_IF2(!pdFound,
00101 m_description << ": could not find member phrase table " << pdName);
00102 }
00103 m_totalModelScores = numScoreComponents;
00104
00105
00106 if (m_phraseCounts) {
00107 numScoreComponents += m_numModels;
00108 }
00109 if (m_wordCounts) {
00110 numScoreComponents += m_numModels;
00111 }
00112 if (m_modelBitmapCounts) {
00113 numScoreComponents += (pow(2, m_numModels) - 1);
00114 }
00115 UTIL_THROW_IF2(numScoreComponents != m_numScoreComponents,
00116 m_description << ": feature count mismatch: specify \"num-features=" << numScoreComponents << "\" and supply " << numScoreComponents << " weights");
00117
00118 #ifdef PT_UG
00119
00120 if (m_haveMmsaptLrFunc) {
00121 BOOST_FOREACH(PhraseDictionary* pd, m_memberPDs) {
00122
00123
00124 m_mmsaptLrFuncs.push_back(&(static_cast<Mmsapt*>(pd)->m_lr_func));
00125 }
00126 }
00127 #endif
00128
00129
00130 if (m_haveDefaultScores) {
00131 UTIL_THROW_IF2(m_defaultScores.size() != m_numScoreComponents,
00132 m_description << ": number of specified default scores is unequal to number of member model scores");
00133 } else {
00134
00135
00136 m_defaultScores = vector<float>(m_numScoreComponents, 0);
00137 }
00138 }
00139
void PhraseDictionaryGroup::InitializeForInput(const ttasksptr& ttask)
{
  // Nothing to do here: member models are registered features themselves
  // and receive their own InitializeForInput calls from the framework.
}
00144
00145 void PhraseDictionaryGroup::GetTargetPhraseCollectionBatch(
00146 const ttasksptr& ttask, const InputPathList& inputPathQueue) const
00147 {
00148
00149 BOOST_FOREACH(const InputPath* inputPath, inputPathQueue) {
00150 const Phrase& phrase = inputPath->GetPhrase();
00151 BOOST_FOREACH(const PhraseDictionary* pd, m_memberPDs) {
00152 pd->PrefixExists(ttask, phrase);
00153 }
00154 }
00155
00156 BOOST_FOREACH(InputPath* inputPath, inputPathQueue) {
00157 const Phrase &phrase = inputPath->GetPhrase();
00158 TargetPhraseCollection::shared_ptr targetPhrases =
00159 this->GetTargetPhraseCollectionLEGACY(ttask, phrase);
00160 inputPath->SetTargetPhrases(*this, targetPhrases, NULL);
00161 }
00162 }
00163
// Task-less lookup is unsupported: member model lookups require the
// translation task context, so this overload always throws.
TargetPhraseCollection::shared_ptr PhraseDictionaryGroup::GetTargetPhraseCollectionLEGACY(
  const Phrase& src) const
{
  UTIL_THROW2("Don't call me without the translation task.");
}
00169
00170 TargetPhraseCollection::shared_ptr
00171 PhraseDictionaryGroup::
00172 GetTargetPhraseCollectionLEGACY(const ttasksptr& ttask, const Phrase& src) const
00173 {
00174 TargetPhraseCollection::shared_ptr ret
00175 = CreateTargetPhraseCollection(ttask, src);
00176 ret->NthElement(m_tableLimit);
00177 const_cast<PhraseDictionaryGroup*>(this)->CacheForCleanup(ret);
00178 return ret;
00179 }
00180
// Merge the target phrase collections of all member models for one source
// phrase. Each distinct target phrase gets a combined score vector laid out
// as: [model 0 scores | model 1 scores | ... | optional phrase counts |
// optional word counts | optional model-bitmap indicators]. Models that did
// not produce a phrase contribute the default scores (or, with
// default-average-others, the average of the models that did).
TargetPhraseCollection::shared_ptr
PhraseDictionaryGroup::
CreateTargetPhraseCollection(const ttasksptr& ttask, const Phrase& src) const
{
  // phraseList preserves first-seen order; phraseMap deduplicates target
  // phrases across models by phrase content (UnorderedComparer<Phrase>).
  vector<TargetPhrase*> phraseList;
  typedef unordered_map<const TargetPhrase*, PDGroupPhrase, UnorderedComparer<Phrase>, UnorderedComparer<Phrase> > PhraseMap;
  PhraseMap phraseMap;

  // Collect phrases from each member model in turn; offset tracks where
  // this model's scores start in the combined vector.
  size_t offset = 0;
  for (size_t i = 0; i < m_numModels; ++i) {

    // Query member model i for this source phrase.
    const PhraseDictionary& pd = *m_memberPDs[i];
    TargetPhraseCollection::shared_ptr
    ret_raw = pd.GetTargetPhraseCollectionLEGACY(ttask, src);

    if (ret_raw != NULL) {

      BOOST_FOREACH(const TargetPhrase* targetPhrase, *ret_raw) {
        // The member model's raw (pre-weight) scores for this phrase.
        vector<float> raw_scores =
          targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd);

        // First time we see this target phrase (by content)?
        PhraseMap::iterator iter = phraseMap.find(targetPhrase);
        if (iter == phraseMap.end()) {
          // With "restrict", only phrases known to model 0 are kept.
          if (m_restrict && i > 0) {
            continue;
          }

          // Take ownership of a copy; the group feature will own scoring.
          TargetPhrase* phrase = new TargetPhrase(*targetPhrase);
          // Remove the member model's dense contribution so it is not
          // counted twice: invert, re-evaluate in isolation, then zero.
          phrase->GetScoreBreakdown().InvertDenseFeatures(&pd);
          vector<FeatureFunction*> pd_feature;
          pd_feature.push_back(m_memberPDs[i]);
          const vector<FeatureFunction*> pd_feature_const(pd_feature);
          phrase->EvaluateInIsolation(src, pd_feature_const);

          phrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);

          phraseList.push_back(phrase);
          // Start from the configured default scores for all models.
          phraseMap[targetPhrase] = PDGroupPhrase(phrase, m_defaultScores, m_numModels);
        } else {
          // Already known: merge this model's extra (sparse/cached) scores
          // into the existing combined phrase.
          TargetPhrase* phrase = iter->second.m_targetPhrase;
          BOOST_FOREACH(const TargetPhrase::ScoreCache_t::value_type pair, targetPhrase->GetExtraScores()) {
            phrase->SetExtraScores(pair.first, pair.second);
          }
        }

        // Re-find after a fresh insert (iter was end() in that case).
        PDGroupPhrase& pdgPhrase = (iter == phraseMap.end()) ? phraseMap.find(targetPhrase)->second : iter->second;

        // Overwrite this model's slice of the combined score vector.
        for (size_t j = 0; j < pd.GetNumScoreComponents(); ++j) {
          pdgPhrase.m_scores[offset + j] = raw_scores[j];
        }

        // Record that model i produced this phrase.
        pdgPhrase.m_seenBy[i] = true;
      }
    }
    offset += pd.GetNumScoreComponents();
  }

  // Second stage: finalize each collected phrase (counts, bitmap,
  // averaging) and evaluate it under the group feature.
  TargetPhraseCollection::shared_ptr ret(new TargetPhraseCollection);
  const vector<FeatureFunction*> pd_feature_const(m_pdFeature);
  BOOST_FOREACH(TargetPhrase* phrase, phraseList) {
    PDGroupPhrase& pdgPhrase = phraseMap.find(phrase)->second;

    // Optional features are appended after the per-model score blocks.
    size_t offset = m_totalModelScores;

    // Indicator: 1 for each model that has seen this phrase.
    if (m_phraseCounts) {
      for (size_t i = 0; i < m_numModels; ++i) {
        if (pdgPhrase.m_seenBy[i]) {
          pdgPhrase.m_scores[offset + i] = 1;
        }
      }
      offset += m_numModels;
    }

    // Target word count, per model that has seen this phrase.
    if (m_wordCounts) {
      size_t wc = pdgPhrase.m_targetPhrase->GetSize();
      for (size_t i = 0; i < m_numModels; ++i) {
        if (pdgPhrase.m_seenBy[i]) {
          pdgPhrase.m_scores[offset + i] = wc;
        }
      }
      offset += m_numModels;
    }

    // One indicator per non-empty model subset; the bitmap value (>=1)
    // indexes into the 2^m - 1 slots (m_seenByAll.to_ulong() == 2^m - 1).
    if (m_modelBitmapCounts) {
      pdgPhrase.m_scores[offset + (pdgPhrase.m_seenBy.to_ulong() - 1)] = 1;
      offset += m_seenByAll.to_ulong();
    }

    // Replace the defaults of unseen models by the average of the seen
    // models' scores (column-wise over the per-model blocks).
    if (m_defaultAverageOthers) {
      // Only needed if at least one model is missing.
      if (pdgPhrase.m_seenBy != m_seenByAll) {
        vector<float> avgScores(m_scoresPerModel, 0);
        size_t seenBy = 0;
        offset = 0;
        // Sum the score blocks of all models that saw the phrase.
        for (size_t i = 0; i < m_numModels; ++i) {
          if (pdgPhrase.m_seenBy[i]) {
            for (size_t j = 0; j < m_scoresPerModel; ++j) {
              avgScores[j] += pdgPhrase.m_scores[offset + j];
            }
            seenBy += 1;
          }
          offset += m_scoresPerModel;
        }
        // seenBy >= 1 here: every listed phrase was seen by some model.
        for (size_t j = 0; j < m_scoresPerModel; ++j) {
          avgScores[j] /= seenBy;
        }
        // Write the average into the blocks of the unseen models.
        offset = 0;
        for (size_t i = 0; i < m_numModels; ++i) {
          if (!pdgPhrase.m_seenBy[i]) {
            for (size_t j = 0; j < m_scoresPerModel; ++j) {
              pdgPhrase.m_scores[offset + j] = avgScores[j];
            }
          }
          offset += m_scoresPerModel;
        }
#ifdef PT_UG
        // Same averaging idea for Mmsapt lexical-reordering scores stored
        // as extra scores on the phrase (only with mmsapt-lr-func).
        if (m_haveMmsaptLrFunc) {
          SPTR<Scores> avgLRScores;
          size_t seenBy = 0;
          // Accumulate LR scores from models that saw the phrase.
          for (size_t i = 0; i < m_numModels; ++i) {
            const LexicalReordering* lrFunc = *m_mmsaptLrFuncs[i];
            // NOTE(review): assumes GetExtraScores(lrFunc) is non-NULL for
            // every seen model with an LR function — confirm upstream.
            if (pdgPhrase.m_seenBy[i] && lrFunc != NULL) {
              const Scores* scores = pdgPhrase.m_targetPhrase->GetExtraScores(lrFunc);
              if (!avgLRScores) {
                avgLRScores.reset(new Scores(*scores));
              } else {
                for (size_t j = 0; j < scores->size(); ++j) {
                  (*avgLRScores)[j] += (*scores)[j];
                }
              }
              seenBy += 1;
            }
          }

          if (avgLRScores) {
            // Average, then assign to the models that lack LR scores.
            for (size_t j = 0; j < avgLRScores->size(); ++j) {
              (*avgLRScores)[j] /= seenBy;
            }

            for (size_t i = 0; i < m_numModels; ++i) {
              const LexicalReordering* lrFunc = *m_mmsaptLrFuncs[i];
              if (!pdgPhrase.m_seenBy[i] && lrFunc != NULL) {
                pdgPhrase.m_targetPhrase->SetExtraScores(lrFunc, avgLRScores);
              }
            }
          }
        }
#endif
      }
    }

    // Publish the combined vector as this feature's scores and evaluate.
    phrase->GetScoreBreakdown().Assign(this, pdgPhrase.m_scores);

    phrase->EvaluateInIsolation(src, pd_feature_const);
    ret->Add(phrase);
  }

  return ret;
}
00368
// Chart (hierarchical/syntax) decoding is not supported by this
// phrase-based-only feature; always throws.
ChartRuleLookupManager*
PhraseDictionaryGroup::
CreateRuleLookupManager(const ChartParser &,
                        const ChartCellCollectionBase&, size_t)
{
  UTIL_THROW(util::Exception, "Phrase table used in chart decoder");
}
00376
00377
00378 void PhraseDictionaryGroup::CacheForCleanup(TargetPhraseCollection::shared_ptr tpc)
00379 {
00380 PhraseCache &ref = GetPhraseCache();
00381 ref.push_back(tpc);
00382 }
00383
void
PhraseDictionaryGroup::
CleanUpAfterSentenceProcessing(const InputType &source)
{
  // Release the target phrase collections cached for this sentence, then
  // let every member model run its own per-sentence cleanup.
  GetPhraseCache().clear();
  CleanUpComponentModels(source);
}
00391
00392 void PhraseDictionaryGroup::CleanUpComponentModels(const InputType &source)
00393 {
00394 for (size_t i = 0; i < m_numModels; ++i) {
00395 m_memberPDs[i]->CleanUpAfterSentenceProcessing(source);
00396 }
00397 }
00398
00399 }