00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "PhraseDictionaryOnDisk.h"
00022 #include "moses/InputFileStream.h"
00023 #include "moses/StaticData.h"
00024 #include "moses/TargetPhraseCollection.h"
00025 #include "moses/InputPath.h"
00026 #include "moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h"
00027 #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h"
00028 #include "moses/TranslationTask.h"
00029
00030 #include "OnDiskPt/OnDiskWrapper.h"
00031 #include "OnDiskPt/Word.h"
00032
00033 #include "util/tokenize_piece.hh"
00034
00035 using namespace std;
00036
00037
00038 namespace Moses
00039 {
00040 PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
00041 : MyBase(line, true)
00042 , m_maxSpanDefault(NOT_FOUND)
00043 , m_maxSpanLabelled(NOT_FOUND)
00044 {
00045 ReadParameters();
00046 }
00047
00048 PhraseDictionaryOnDisk::~PhraseDictionaryOnDisk()
00049 {
00050 }
00051
00052 void PhraseDictionaryOnDisk::Load(AllOptions::ptr const& opts)
00053 {
00054 m_options = opts;
00055 SetFeaturesToApply();
00056 }
00057
00058 ChartRuleLookupManager *PhraseDictionaryOnDisk::CreateRuleLookupManager(
00059 const ChartParser &parser,
00060 const ChartCellCollectionBase &cellCollection,
00061 std::size_t )
00062 {
00063 return new ChartRuleLookupManagerOnDisk(parser, cellCollection, *this,
00064 GetImplementation(),
00065 m_input,
00066 m_output);
00067 }
00068
00069 OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation()
00070 {
00071 OnDiskPt::OnDiskWrapper* dict;
00072 dict = m_implementation.get();
00073 UTIL_THROW_IF2(dict == NULL, "Dictionary object not yet created for this thread");
00074 return *dict;
00075 }
00076
00077 const OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation() const
00078 {
00079 OnDiskPt::OnDiskWrapper* dict;
00080 dict = m_implementation.get();
00081 UTIL_THROW_IF2(dict == NULL, "Dictionary object not yet created for this thread");
00082 return *dict;
00083 }
00084
00085 void PhraseDictionaryOnDisk::InitializeForInput(ttasksptr const& ttask)
00086 {
00087 InputType const& source = *ttask->GetSource();
00088 ReduceCache();
00089
00090 OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();
00091 obj->BeginLoad(m_filePath);
00092
00093 UTIL_THROW_IF2(obj->GetMisc("Version") != OnDiskPt::OnDiskWrapper::VERSION_NUM,
00094 "On-disk phrase table is version " << obj->GetMisc("Version")
00095 << ". It is not compatible with version " << OnDiskPt::OnDiskWrapper::VERSION_NUM);
00096
00097 UTIL_THROW_IF2(obj->GetMisc("NumSourceFactors") != m_input.size(),
00098 "On-disk phrase table has " << obj->GetMisc("NumSourceFactors") << " source factors."
00099 << ". The ini file specified " << m_input.size() << " source factors");
00100
00101 UTIL_THROW_IF2(obj->GetMisc("NumTargetFactors") != m_output.size(),
00102 "On-disk phrase table has " << obj->GetMisc("NumTargetFactors") << " target factors."
00103 << ". The ini file specified " << m_output.size() << " target factors");
00104
00105 UTIL_THROW_IF2(obj->GetMisc("NumScores") != m_numScoreComponents,
00106 "On-disk phrase table has " << obj->GetMisc("NumScores") << " scores."
00107 << ". The ini file specified " << m_numScoreComponents << " scores");
00108
00109 m_implementation.reset(obj);
00110 }
00111
00112 void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
00113 {
00114 InputPathList::const_iterator iter;
00115 for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
00116 InputPath &inputPath = **iter;
00117 GetTargetPhraseCollectionBatch(inputPath);
00118 }
00119
00120
00121 for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
00122 InputPath &inputPath = **iter;
00123 const OnDiskPt::PhraseNode *ptNode = static_cast<const OnDiskPt::PhraseNode*>(inputPath.GetPtNode(*this));
00124 delete ptNode;
00125 }
00126
00127 }
00128
00129 void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(InputPath &inputPath) const
00130 {
00131 OnDiskPt::OnDiskWrapper &wrapper = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
00132 const Phrase &phrase = inputPath.GetPhrase();
00133 const InputPath *prevInputPath = inputPath.GetPrevPath();
00134
00135 const OnDiskPt::PhraseNode *prevPtNode = NULL;
00136
00137 if (prevInputPath) {
00138 prevPtNode = static_cast<const OnDiskPt::PhraseNode*>(prevInputPath->GetPtNode(*this));
00139 } else {
00140
00141 assert(phrase.GetSize() == 1);
00142 prevPtNode = &wrapper.GetRootSourceNode();
00143 }
00144
00145
00146 if (!SatisfyBackoff(inputPath)) {
00147 return;
00148 }
00149
00150 if (prevPtNode) {
00151 Word lastWord = phrase.GetWord(phrase.GetSize() - 1);
00152 lastWord.OnlyTheseFactors(m_inputFactors);
00153 OnDiskPt::Word *lastWordOnDisk = ConvertFromMoses(wrapper, m_input, lastWord);
00154
00155 TargetPhraseCollection::shared_ptr tpc;
00156 if (lastWordOnDisk == NULL) {
00157
00158 inputPath.SetTargetPhrases(*this, tpc, NULL);
00159 } else {
00160 OnDiskPt::PhraseNode const* ptNode;
00161 ptNode = prevPtNode->GetChild(*lastWordOnDisk, wrapper);
00162 if (ptNode) tpc = GetTargetPhraseCollection(ptNode);
00163 inputPath.SetTargetPhrases(*this, tpc, ptNode);
00164
00165 delete lastWordOnDisk;
00166 }
00167 }
00168 }
00169
00170 TargetPhraseCollection::shared_ptr
00171 PhraseDictionaryOnDisk::
00172 GetTargetPhraseCollection(const OnDiskPt::PhraseNode *ptNode) const
00173 {
00174 TargetPhraseCollection::shared_ptr ret;
00175
00176 CacheColl &cache = GetCache();
00177 size_t hash = (size_t) ptNode->GetFilePos();
00178
00179 CacheColl::iterator iter;
00180
00181 iter = cache.find(hash);
00182
00183 if (iter == cache.end()) {
00184
00185 ret = GetTargetPhraseCollectionNonCache(ptNode);
00186
00187 std::pair<TargetPhraseCollection::shared_ptr , clock_t> value(ret, clock());
00188 cache[hash] = value;
00189 } else {
00190
00191 iter->second.second = clock();
00192 ret = iter->second.first;
00193 }
00194
00195 return ret;
00196 }
00197
00198 TargetPhraseCollection::shared_ptr
00199 PhraseDictionaryOnDisk::
00200 GetTargetPhraseCollectionNonCache(const OnDiskPt::PhraseNode *ptNode) const
00201 {
00202 OnDiskPt::OnDiskWrapper& wrapper
00203 = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
00204
00205 vector<float> weightT = StaticData::Instance().GetWeights(this);
00206 OnDiskPt::Vocab &vocab = wrapper.GetVocab();
00207
00208 OnDiskPt::TargetPhraseCollection::shared_ptr targetPhrasesOnDisk
00209 = ptNode->GetTargetPhraseCollection(m_tableLimit, wrapper);
00210 TargetPhraseCollection::shared_ptr targetPhrases
00211 = ConvertToMoses(targetPhrasesOnDisk, m_input, m_output, *this,
00212 weightT, vocab, false);
00213
00214
00215
00216 return targetPhrases;
00217 }
00218
00219 Moses::TargetPhraseCollection::shared_ptr
00220 PhraseDictionaryOnDisk::ConvertToMoses(
00221 const OnDiskPt::TargetPhraseCollection::shared_ptr targetPhrasesOnDisk
00222 , const std::vector<Moses::FactorType> &inputFactors
00223 , const std::vector<Moses::FactorType> &outputFactors
00224 , const Moses::PhraseDictionary &phraseDict
00225 , const std::vector<float> &weightT
00226 , OnDiskPt::Vocab &vocab
00227 , bool isSyntax) const
00228 {
00229 Moses::TargetPhraseCollection::shared_ptr ret;
00230 ret.reset(new Moses::TargetPhraseCollection);
00231
00232 for (size_t i = 0; i < targetPhrasesOnDisk->GetSize(); ++i) {
00233 const OnDiskPt::TargetPhrase &tp = targetPhrasesOnDisk->GetTargetPhrase(i);
00234 Moses::TargetPhrase *mosesPhrase
00235 = ConvertToMoses(tp, inputFactors, outputFactors, vocab,
00236 phraseDict, weightT, isSyntax);
00237
00238
00239
00240
00241
00242
00243
00244
00245 ret->Add(mosesPhrase);
00246 }
00247
00248 ret->Sort(true, phraseDict.GetTableLimit());
00249
00250 return ret;
00251 }
00252
00253 Moses::TargetPhrase *PhraseDictionaryOnDisk::ConvertToMoses(const OnDiskPt::TargetPhrase &targetPhraseOnDisk
00254 , const std::vector<Moses::FactorType> &inputFactors
00255 , const std::vector<Moses::FactorType> &outputFactors
00256 , const OnDiskPt::Vocab &vocab
00257 , const Moses::PhraseDictionary &phraseDict
00258 , const std::vector<float> &weightT
00259 , bool isSyntax) const
00260 {
00261 Moses::TargetPhrase *ret = new Moses::TargetPhrase(&phraseDict);
00262
00263
00264 size_t phraseSize = targetPhraseOnDisk.GetSize();
00265 UTIL_THROW_IF2(phraseSize == 0, "Target phrase cannot be empty");
00266 if (isSyntax) {
00267 --phraseSize;
00268 }
00269
00270 for (size_t pos = 0; pos < phraseSize; ++pos) {
00271 const OnDiskPt::Word &wordOnDisk = targetPhraseOnDisk.GetWord(pos);
00272 ConvertToMoses(wordOnDisk, outputFactors, vocab, ret->AddWord());
00273 }
00274
00275
00276
00277 Moses::AlignmentInfo::CollType alignTerm, alignNonTerm;
00278 std::set<std::pair<size_t, size_t> > alignmentInfo;
00279 const OnDiskPt::PhrasePtr sp = targetPhraseOnDisk.GetSourcePhrase();
00280 for (size_t ind = 0; ind < targetPhraseOnDisk.GetAlign().size(); ++ind) {
00281 const std::pair<size_t, size_t> &entry = targetPhraseOnDisk.GetAlign()[ind];
00282 alignmentInfo.insert(entry);
00283 size_t sourcePos = entry.first;
00284 size_t targetPos = entry.second;
00285
00286 if (targetPhraseOnDisk.GetWord(targetPos).IsNonTerminal()) {
00287 alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
00288 } else {
00289 alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
00290 }
00291
00292 }
00293 ret->SetAlignTerm(alignTerm);
00294 ret->SetAlignNonTerm(alignNonTerm);
00295
00296 if (isSyntax) {
00297 Moses::Word *lhsTarget = new Moses::Word(true);
00298 const OnDiskPt::Word &lhsOnDisk = targetPhraseOnDisk.GetWord(targetPhraseOnDisk.GetSize() - 1);
00299 ConvertToMoses(lhsOnDisk, outputFactors, vocab, *lhsTarget);
00300 ret->SetTargetLHS(lhsTarget);
00301 }
00302
00303
00304 Moses::Phrase mosesSP(Moses::Input);
00305 for (size_t pos = 0; pos < sp->GetSize(); ++pos) {
00306 ConvertToMoses(sp->GetWord(pos), inputFactors, vocab, mosesSP.AddWord());
00307 }
00308
00309
00310 ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetScores());
00311
00312
00313 ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetSparseFeatures());
00314
00315
00316 ret->SetProperties(targetPhraseOnDisk.GetProperty());
00317
00318 ret->EvaluateInIsolation(mosesSP, phraseDict.GetFeaturesToApply());
00319
00320 return ret;
00321 }
00322
00323 void PhraseDictionaryOnDisk::ConvertToMoses(
00324 const OnDiskPt::Word &wordOnDisk,
00325 const std::vector<Moses::FactorType> &outputFactorsVec,
00326 const OnDiskPt::Vocab &vocab,
00327 Moses::Word &overwrite) const
00328 {
00329 Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
00330 overwrite = Moses::Word(wordOnDisk.IsNonTerminal());
00331
00332 if (wordOnDisk.IsNonTerminal()) {
00333 const std::string &tok = vocab.GetString(wordOnDisk.GetVocabId());
00334 overwrite.SetFactor(0, factorColl.AddFactor(tok, wordOnDisk.IsNonTerminal()));
00335 } else {
00336
00337 util::TokenIter<util::SingleCharacter> tok(vocab.GetString(wordOnDisk.GetVocabId()), '|');
00338
00339 for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
00340 UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size());
00341 overwrite.SetFactor(*t, factorColl.AddFactor(*tok, wordOnDisk.IsNonTerminal()));
00342 }
00343 UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size());
00344 }
00345 }
00346
00347 OnDiskPt::Word *PhraseDictionaryOnDisk::ConvertFromMoses(OnDiskPt::OnDiskWrapper &wrapper, const std::vector<Moses::FactorType> &factorsVec
00348 , const Moses::Word &origWord) const
00349 {
00350 bool isNonTerminal = origWord.IsNonTerminal();
00351 OnDiskPt::Word *newWord = new OnDiskPt::Word(isNonTerminal);
00352
00353 util::StringStream strme;
00354
00355 size_t factorType = factorsVec[0];
00356 const Moses::Factor *factor = origWord.GetFactor(factorType);
00357 UTIL_THROW_IF2(factor == NULL, "Expecting factor " << factorType);
00358 strme << factor->GetString();
00359
00360 for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
00361 size_t factorType = factorsVec[ind];
00362 const Moses::Factor *factor = origWord.GetFactor(factorType);
00363 if (factor == NULL) {
00364
00365 break;
00366 }
00367 UTIL_THROW_IF2(factor == NULL,
00368 "Expecting factor " << factorType << " at position " << ind);
00369 strme << "|" << factor->GetString();
00370 }
00371
00372 bool found;
00373 uint64_t vocabId = wrapper.GetVocab().GetVocabId(strme.str(), found);
00374 if (!found) {
00375
00376 delete newWord;
00377 return NULL;
00378 } else {
00379 newWord->SetVocabId(vocabId);
00380 return newWord;
00381 }
00382
00383 }
00384
00385 void PhraseDictionaryOnDisk::SetParameter(const std::string& key, const std::string& value)
00386 {
00387 if (key == "max-span-default") {
00388 m_maxSpanDefault = Scan<size_t>(value);
00389 } else if (key == "max-span-labelled") {
00390 m_maxSpanLabelled = Scan<size_t>(value);
00391 } else {
00392 PhraseDictionary::SetParameter(key, value);
00393 }
00394 }
00395
00396
00397 }
00398