00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "moses/TranslationModel/PhraseDictionaryGroup.h"
00021
00022 #include <boost/foreach.hpp>
00023 #include <boost/unordered_map.hpp>
00024
00025 #include "util/exception.hh"
00026
00027 using namespace std;
00028 using namespace boost;
00029
00030 namespace Moses
00031 {
00032
00033 PhraseDictionaryGroup::PhraseDictionaryGroup(const string &line)
00034 : PhraseDictionary(line, true),
00035 m_numModels(0),
00036 m_totalModelScores(0),
00037 m_phraseCounts(false),
00038 m_wordCounts(false),
00039 m_modelBitmapCounts(false),
00040 m_restrict(false),
00041 m_haveDefaultScores(false),
00042 m_defaultAverageOthers(false),
00043 m_scoresPerModel(0),
00044 m_haveMmsaptLrFunc(false)
00045 {
00046 ReadParameters();
00047 }
00048
00049 void PhraseDictionaryGroup::SetParameter(const string& key, const string& value)
00050 {
00051 if (key == "members") {
00052 m_memberPDStrs = Tokenize(value, ",");
00053 m_numModels = m_memberPDStrs.size();
00054 m_seenByAll = dynamic_bitset<>(m_numModels);
00055 m_seenByAll.set();
00056 } else if (key == "restrict") {
00057 m_restrict = Scan<bool>(value);
00058 } else if (key == "phrase-counts") {
00059 m_phraseCounts = Scan<bool>(value);
00060 } else if (key == "word-counts") {
00061 m_wordCounts = Scan<bool>(value);
00062 } else if (key == "model-bitmap-counts") {
00063 m_modelBitmapCounts = Scan<bool>(value);
00064 } else if (key =="default-scores") {
00065 m_haveDefaultScores = true;
00066 m_defaultScores = Scan<float>(Tokenize(value, ","));
00067 } else if (key =="default-average-others") {
00068 m_defaultAverageOthers = Scan<bool>(value);
00069 } else if (key =="mmsapt-lr-func") {
00070 m_haveMmsaptLrFunc = true;
00071 } else {
00072 PhraseDictionary::SetParameter(key, value);
00073 }
00074 }
00075
00076 void PhraseDictionaryGroup::Load(AllOptions::ptr const& opts)
00077 {
00078 m_options = opts;
00079 SetFeaturesToApply();
00080 m_pdFeature.push_back(const_cast<PhraseDictionaryGroup*>(this));
00081 size_t numScoreComponents = 0;
00082
00083
00084 BOOST_FOREACH(const string& pdName, m_memberPDStrs) {
00085 bool pdFound = false;
00086 BOOST_FOREACH(PhraseDictionary* pd, PhraseDictionary::GetColl()) {
00087 if (pd->GetScoreProducerDescription() == pdName) {
00088 pdFound = true;
00089 m_memberPDs.push_back(pd);
00090 size_t nScores = pd->GetNumScoreComponents();
00091 numScoreComponents += nScores;
00092 if (m_scoresPerModel == 0) {
00093 m_scoresPerModel = nScores;
00094 } else if (m_defaultAverageOthers) {
00095 UTIL_THROW_IF2(nScores != m_scoresPerModel,
00096 m_description << ": member models must have the same number of scores when using default-average-others");
00097 }
00098 }
00099 }
00100 UTIL_THROW_IF2(!pdFound,
00101 m_description << ": could not find member phrase table " << pdName);
00102 }
00103 m_totalModelScores = numScoreComponents;
00104
00105
00106 if (m_phraseCounts) {
00107 numScoreComponents += m_numModels;
00108 }
00109 if (m_wordCounts) {
00110 numScoreComponents += m_numModels;
00111 }
00112 if (m_modelBitmapCounts) {
00113 numScoreComponents += (pow(2, m_numModels) - 1);
00114 }
00115 UTIL_THROW_IF2(numScoreComponents != m_numScoreComponents,
00116 m_description << ": feature count mismatch: specify \"num-features=" << numScoreComponents << "\" and supply " << numScoreComponents << " weights");
00117
00118 #ifdef PT_UG
00119
00120 if (m_haveMmsaptLrFunc) {
00121 BOOST_FOREACH(PhraseDictionary* pd, m_memberPDs) {
00122
00123
00124 m_mmsaptLrFuncs.push_back(&(static_cast<Mmsapt*>(pd)->m_lr_func));
00125 }
00126 }
00127 #endif
00128
00129
00130 if (m_haveDefaultScores) {
00131 UTIL_THROW_IF2(m_defaultScores.size() != m_numScoreComponents,
00132 m_description << ": number of specified default scores is unequal to number of member model scores");
00133 } else {
00134
00135
00136 m_defaultScores = vector<float>(m_numScoreComponents, 0);
00137 }
00138 }
00139
void PhraseDictionaryGroup::InitializeForInput(const ttasksptr& ttask)
{
  // Nothing to do here: member models are registered features themselves
  // and receive their own InitializeForInput calls from the framework.
}
00144
00145 void PhraseDictionaryGroup::GetTargetPhraseCollectionBatch(
00146 const ttasksptr& ttask, const InputPathList& inputPathQueue) const
00147 {
00148
00149 BOOST_FOREACH(const InputPath* inputPath, inputPathQueue) {
00150 const Phrase& phrase = inputPath->GetPhrase();
00151 BOOST_FOREACH(const PhraseDictionary* pd, m_memberPDs) {
00152 pd->PrefixExists(ttask, phrase);
00153 }
00154 }
00155
00156 BOOST_FOREACH(InputPath* inputPath, inputPathQueue) {
00157 const Phrase &phrase = inputPath->GetPhrase();
00158 TargetPhraseCollection::shared_ptr targetPhrases =
00159 this->GetTargetPhraseCollectionLEGACY(ttask, phrase);
00160 inputPath->SetTargetPhrases(*this, targetPhrases, NULL);
00161 }
00162 }
00163
// Task-less lookup is unsupported: member model lookups require the
// translation task context, so this overload always throws.
TargetPhraseCollection::shared_ptr PhraseDictionaryGroup::GetTargetPhraseCollectionLEGACY(
  const Phrase& src) const
{
  UTIL_THROW2("Don't call me without the translation task.");
}
00169
00170 TargetPhraseCollection::shared_ptr
00171 PhraseDictionaryGroup::
00172 GetTargetPhraseCollectionLEGACY(const ttasksptr& ttask, const Phrase& src) const
00173 {
00174 TargetPhraseCollection::shared_ptr ret
00175 = CreateTargetPhraseCollection(ttask, src);
00176 ret->NthElement(m_tableLimit);
00177 const_cast<PhraseDictionaryGroup*>(this)->CacheForCleanup(ret);
00178 return ret;
00179 }
00180
// Merge the target phrase collections of all member models for one source
// phrase. Each distinct target phrase gets a combined score vector laid out
// as: [model 0 scores | model 1 scores | ... | optional phrase counts |
// optional word counts | optional model-bitmap indicators]. Models that did
// not produce a phrase contribute the default scores (or, with
// default-average-others, the average of the models that did).
TargetPhraseCollection::shared_ptr
PhraseDictionaryGroup::
CreateTargetPhraseCollection(const ttasksptr& ttask, const Phrase& src) const
{
  // phraseList preserves first-seen order; phraseMap deduplicates target
  // phrases across models by phrase content (UnorderedComparer<Phrase>).
  vector<TargetPhrase*> phraseList;
  typedef unordered_map<const TargetPhrase*, PDGroupPhrase, UnorderedComparer<Phrase>, UnorderedComparer<Phrase> > PhraseMap;
  PhraseMap phraseMap;

  // Collect phrases from each member model in turn; offset tracks where
  // this model's scores start in the combined vector.
  size_t offset = 0;
  for (size_t i = 0; i < m_numModels; ++i) {

    // Query member model i for this source phrase.
    const PhraseDictionary& pd = *m_memberPDs[i];
    TargetPhraseCollection::shared_ptr
    ret_raw = pd.GetTargetPhraseCollectionLEGACY(ttask, src);

    if (ret_raw != NULL) {

      BOOST_FOREACH(const TargetPhrase* targetPhrase, *ret_raw) {
        // The member model's raw (pre-weight) scores for this phrase.
        vector<float> raw_scores =
          targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd);

        // First time we see this target phrase (by content)?
        PhraseMap::iterator iter = phraseMap.find(targetPhrase);
        if (iter == phraseMap.end()) {
          // With "restrict", only phrases known to model 0 are kept.
          if (m_restrict && i > 0) {
            continue;
          }

          // Take ownership of a copy; the group feature will own scoring.
          TargetPhrase* phrase = new TargetPhrase(*targetPhrase);
          // Remove the member model's dense contribution so it is not
          // counted twice: invert, re-evaluate in isolation, then zero.
          phrase->GetScoreBreakdown().InvertDenseFeatures(&pd);
          vector<FeatureFunction*> pd_feature;
          pd_feature.push_back(m_memberPDs[i]);
          const vector<FeatureFunction*> pd_feature_const(pd_feature);
          phrase->EvaluateInIsolation(src, pd_feature_const);

          phrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);

          phraseList.push_back(phrase);
          // Start from the configured default scores for all models.
          phraseMap[targetPhrase] = PDGroupPhrase(phrase, m_defaultScores, m_numModels);
        } else {
          // Already known: merge this model's extra (sparse/cached) scores
          // into the existing combined phrase.
          TargetPhrase* phrase = iter->second.m_targetPhrase;
          BOOST_FOREACH(const TargetPhrase::ScoreCache_t::value_type pair, targetPhrase->GetExtraScores()) {
            phrase->SetExtraScores(pair.first, pair.second);
          }
        }

        // Re-find after a fresh insert (iter was end() in that case).
        PDGroupPhrase& pdgPhrase = (iter == phraseMap.end()) ? phraseMap.find(targetPhrase)->second : iter->second;

        // Overwrite this model's slice of the combined score vector.
        for (size_t j = 0; j < pd.GetNumScoreComponents(); ++j) {
          pdgPhrase.m_scores[offset + j] = raw_scores[j];
        }

        // Record that model i produced this phrase.
        pdgPhrase.m_seenBy[i] = true;
      }
    }
    offset += pd.GetNumScoreComponents();
  }

  // Second stage: finalize each collected phrase (counts, bitmap,
  // averaging) and evaluate it under the group feature.
  TargetPhraseCollection::shared_ptr ret(new TargetPhraseCollection);
  const vector<FeatureFunction*> pd_feature_const(m_pdFeature);
  BOOST_FOREACH(TargetPhrase* phrase, phraseList) {
    PDGroupPhrase& pdgPhrase = phraseMap.find(phrase)->second;

    // Optional features are appended after the per-model score blocks.
    size_t offset = m_totalModelScores;

    // Indicator: 1 for each model that has seen this phrase.
    if (m_phraseCounts) {
      for (size_t i = 0; i < m_numModels; ++i) {
        if (pdgPhrase.m_seenBy[i]) {
          pdgPhrase.m_scores[offset + i] = 1;
        }
      }
      offset += m_numModels;
    }

    // Target word count, per model that has seen this phrase.
    if (m_wordCounts) {
      size_t wc = pdgPhrase.m_targetPhrase->GetSize();
      for (size_t i = 0; i < m_numModels; ++i) {
        if (pdgPhrase.m_seenBy[i]) {
          pdgPhrase.m_scores[offset + i] = wc;
        }
      }
      offset += m_numModels;
    }

    // One indicator per non-empty model subset; the bitmap value (>=1)
    // indexes into the 2^m - 1 slots (m_seenByAll.to_ulong() == 2^m - 1).
    if (m_modelBitmapCounts) {
      pdgPhrase.m_scores[offset + (pdgPhrase.m_seenBy.to_ulong() - 1)] = 1;
      offset += m_seenByAll.to_ulong();
    }

    // Replace the defaults of unseen models by the average of the seen
    // models' scores (column-wise over the per-model blocks).
    if (m_defaultAverageOthers) {
      // Only needed if at least one model is missing.
      if (pdgPhrase.m_seenBy != m_seenByAll) {
        vector<float> avgScores(m_scoresPerModel, 0);
        size_t seenBy = 0;
        offset = 0;
        // Sum the score blocks of all models that saw the phrase.
        for (size_t i = 0; i < m_numModels; ++i) {
          if (pdgPhrase.m_seenBy[i]) {
            for (size_t j = 0; j < m_scoresPerModel; ++j) {
              avgScores[j] += pdgPhrase.m_scores[offset + j];
            }
            seenBy += 1;
          }
          offset += m_scoresPerModel;
        }
        // seenBy >= 1 here: every listed phrase was seen by some model.
        for (size_t j = 0; j < m_scoresPerModel; ++j) {
          avgScores[j] /= seenBy;
        }
        // Write the average into the blocks of the unseen models.
        offset = 0;
        for (size_t i = 0; i < m_numModels; ++i) {
          if (!pdgPhrase.m_seenBy[i]) {
            for (size_t j = 0; j < m_scoresPerModel; ++j) {
              pdgPhrase.m_scores[offset + j] = avgScores[j];
            }
          }
          offset += m_scoresPerModel;
        }
#ifdef PT_UG
        // Same averaging idea for Mmsapt lexical-reordering scores stored
        // as extra scores on the phrase (only with mmsapt-lr-func).
        if (m_haveMmsaptLrFunc) {
          SPTR<Scores> avgLRScores;
          size_t seenBy = 0;
          // Accumulate LR scores from models that saw the phrase.
          for (size_t i = 0; i < m_numModels; ++i) {
            const LexicalReordering* lrFunc = *m_mmsaptLrFuncs[i];
            // NOTE(review): assumes GetExtraScores(lrFunc) is non-NULL for
            // every seen model with an LR function — confirm upstream.
            if (pdgPhrase.m_seenBy[i] && lrFunc != NULL) {
              const Scores* scores = pdgPhrase.m_targetPhrase->GetExtraScores(lrFunc);
              if (!avgLRScores) {
                avgLRScores.reset(new Scores(*scores));
              } else {
                for (size_t j = 0; j < scores->size(); ++j) {
                  (*avgLRScores)[j] += (*scores)[j];
                }
              }
              seenBy += 1;
            }
          }

          if (avgLRScores) {
            // Average, then assign to the models that lack LR scores.
            for (size_t j = 0; j < avgLRScores->size(); ++j) {
              (*avgLRScores)[j] /= seenBy;
            }

            for (size_t i = 0; i < m_numModels; ++i) {
              const LexicalReordering* lrFunc = *m_mmsaptLrFuncs[i];
              if (!pdgPhrase.m_seenBy[i] && lrFunc != NULL) {
                pdgPhrase.m_targetPhrase->SetExtraScores(lrFunc, avgLRScores);
              }
            }
          }
        }
#endif
      }
    }

    // Publish the combined vector as this feature's scores and evaluate.
    phrase->GetScoreBreakdown().Assign(this, pdgPhrase.m_scores);

    phrase->EvaluateInIsolation(src, pd_feature_const);
    ret->Add(phrase);
  }

  return ret;
}
00368
// Chart (hierarchical/syntax) decoding is not supported by this
// phrase-based-only feature; always throws.
ChartRuleLookupManager*
PhraseDictionaryGroup::
CreateRuleLookupManager(const ChartParser &,
                        const ChartCellCollectionBase&, size_t)
{
  UTIL_THROW(util::Exception, "Phrase table used in chart decoder");
}
00376
00377
00378 void PhraseDictionaryGroup::CacheForCleanup(TargetPhraseCollection::shared_ptr tpc)
00379 {
00380 PhraseCache &ref = GetPhraseCache();
00381 ref.push_back(tpc);
00382 }
00383
void
PhraseDictionaryGroup::
CleanUpAfterSentenceProcessing(const InputType &source)
{
  // Release the target phrase collections cached for this sentence, then
  // let every member model run its own per-sentence cleanup.
  GetPhraseCache().clear();
  CleanUpComponentModels(source);
}
00391
00392 void PhraseDictionaryGroup::CleanUpComponentModels(const InputType &source)
00393 {
00394 for (size_t i = 0; i < m_numModels; ++i) {
00395 m_memberPDs[i]->CleanUpAfterSentenceProcessing(source);
00396 }
00397 }
00398
00399 }