00001
00002
00003 #include "moses/FeatureVector.h"
00004 #include "moses/TranslationModel/PhraseDictionaryTree.h"
00005 #include "util/exception.hh"
00006 #include "moses/StaticData.h"
00007
00008 #include <map>
00009 #include <sstream>
00010 #include <iostream>
00011 #include <fstream>
00012 #include <string>
00013 #include <vector>
00014
00015
00016 namespace Moses
00017 {
00018
// Streams a vector as "<size> <elem0> <elem1> ... ": the element count first,
// then every element, each followed by a single blank.
template<typename T>
std::ostream& operator<<(std::ostream& out,const std::vector<T>& x)
{
  out << x.size() << " ";
  for (typename std::vector<T>::size_type pos = 0; pos < x.size(); ++pos) {
    out << x[pos] << ' ';
  }
  return out;
}
00028
00029
00030 class TgtCand
00031 {
00032 IPhrase e;
00033 Scores sc;
00034 std::string m_alignment;
00035 IPhrase fnames;
00036 std::vector<FValue> fvalues;
00037
00038 static const float SPARSE_FLAG;
00039
00040 public:
00041 TgtCand() {}
00042
00043 TgtCand(const IPhrase& a, const Scores& b , const std::string& alignment)
00044 : e(a)
00045 , sc(b)
00046 , m_alignment(alignment) {
00047 }
00048
00049 TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
00050
00051 TgtCand(FILE* f) {
00052 readBin(f);
00053 }
00054
00055
00056 void writeBin(FILE* f) const {
00057 fWriteVector(f,e);
00058
00059
00060
00061 if (fnames.size()) {
00062 Scores sc_copy(sc);
00063 sc_copy.push_back(SPARSE_FLAG);
00064 fWriteVector(f,sc_copy);
00065 fWriteVector(f,fnames);
00066 fWriteVector(f,fvalues);
00067 } else {
00068 fWriteVector(f,sc);
00069 }
00070 }
00071
00072 void readBin(FILE* f) {
00073 fReadVector(f,e);
00074 fReadVector(f,sc);
00075 if (sc.back() == 100) {
00076 sc.pop_back();
00077 fReadVector(f,fnames);
00078 fReadVector(f,fvalues);
00079 }
00080 }
00081
00082 void writeBinWithAlignment(FILE* f) const {
00083 writeBin(f);
00084 fWriteString(f, m_alignment.c_str(), m_alignment.size());
00085 }
00086
00087 void readBinWithAlignment(FILE* f) {
00088 readBin(f);
00089 fReadString(f, m_alignment);
00090 }
00091
00092 const IPhrase& GetPhrase() const {
00093 return e;
00094 }
00095 const Scores& GetScores() const {
00096 return sc;
00097 }
00098 const std::string& GetAlignment() const {
00099 return m_alignment;
00100 }
00101
00102 const IPhrase& GetFeatureNames() const {
00103 return fnames;
00104 }
00105
00106 const std::vector<FValue> GetFeatureValues() const {
00107 return fvalues;
00108 }
00109
00110 void SetFeatures(const IPhrase& names, const std::vector<FValue>& values) {
00111 UTIL_THROW_IF2(names.size() != values.size(), "Error");
00112 fnames = names;
00113 fvalues = values;
00114 }
00115 };
00116
00117 const float TgtCand::SPARSE_FLAG = 100;
00118
00119
00120 class TgtCands : public std::vector<TgtCand>
00121 {
00122 typedef std::vector<TgtCand> MyBase;
00123 public:
00124 TgtCands() : MyBase() {}
00125
00126 void writeBin(FILE* f) const {
00127 unsigned s=size();
00128 fWrite(f,s);
00129 for(size_t i=0; i<s; ++i) MyBase::operator[](i).writeBin(f);
00130 }
00131
00132 void writeBinWithAlignment(FILE* f) const {
00133 unsigned s=size();
00134 fWrite(f,s);
00135 for(size_t i=0; i<s; ++i) MyBase::operator[](i).writeBinWithAlignment(f);
00136 }
00137
00138 void readBin(FILE* f) {
00139 unsigned s;
00140 fRead(f,s);
00141 resize(s);
00142 for(size_t i=0; i<s; ++i) MyBase::operator[](i).readBin(f);
00143 }
00144
00145 void readBinWithAlignment(FILE* f) {
00146 unsigned s;
00147 fRead(f,s);
00148 resize(s);
00149 for(size_t i=0; i<s; ++i) MyBase::operator[](i).readBinWithAlignment(f);
00150 }
00151 };
00152
00153
00154 PhraseDictionaryTree::PrefixPtr::operator bool() const
00155 {
00156 return imp && imp->isValid();
00157 }
00158
00159 typedef LVoc<std::string> WordVoc;
00160
00161
00162 class PDTimp
00163 {
00164 public:
00165 typedef PrefixTreeF<LabelId,OFF_T> PTF;
00166 typedef FilePtr<PTF> CPT;
00167 typedef std::vector<CPT> Data;
00168
00169
00170 Data data;
00171 std::vector<OFF_T> srcOffsets;
00172
00173 FILE *os,*ot;
00174 WordVoc sv;
00175 WordVoc tv;
00176
00177 ObjectPool<PPimp> pPool;
00178
00179
00180 bool needwordalign, haswordAlign;
00181 bool printwordalign;
00182
00183 PDTimp() : os(0),ot(0), printwordalign(false) {
00184 PTF::setDefault(InvalidOffT);
00185 }
00186 ~PDTimp() {
00187 if(os) fClose(os);
00188 if(ot) fClose(ot);
00189 FreeMemory();
00190 }
00191
00192 inline void NeedAlignmentInfo(bool a) {
00193 needwordalign=a;
00194 }
00195 inline bool NeedAlignmentInfo() {
00196 return needwordalign;
00197 };
00198 inline void HasAlignmentInfo(bool a) {
00199 haswordAlign=a;
00200 }
00201 inline bool HasAlignmentInfo() {
00202 return haswordAlign;
00203 };
00204
00205 inline void PrintWordAlignment(bool a) {
00206 printwordalign=a;
00207 };
00208 inline bool PrintWordAlignment() {
00209 return printwordalign;
00210 };
00211
00212 void FreeMemory() {
00213 for(Data::iterator i=data.begin(); i!=data.end(); ++i) (*i).free();
00214 pPool.reset();
00215 }
00216
00217 int Read(const std::string& fn);
00218
00219 void GetTargetCandidates(const IPhrase& f,TgtCands& tgtCands) {
00220 if(f.empty()) return;
00221 if(f[0]>=data.size()) return;
00222 if(!data[f[0]]) return;
00223 assert(data[f[0]]->findKey(f[0])<data[f[0]]->size());
00224 OFF_T tCandOffset=data[f[0]]->find(f);
00225 if(tCandOffset==InvalidOffT) return;
00226 fSeek(ot,tCandOffset);
00227
00228 if (HasAlignmentInfo())
00229 tgtCands.readBinWithAlignment(ot);
00230 else
00231 tgtCands.readBin(ot);
00232 }
00233
00234 typedef PhraseDictionaryTree::PrefixPtr PPtr;
00235
00236 void GetTargetCandidates(PPtr p,TgtCands& tgtCands) {
00237 UTIL_THROW_IF2(p == 0L, "Error");
00238
00239
00240 if(p.imp->isRoot()) return;
00241 OFF_T tCandOffset=p.imp->ptr()->getData(p.imp->idx);
00242 if(tCandOffset==InvalidOffT) return;
00243 fSeek(ot,tCandOffset);
00244 if (HasAlignmentInfo())
00245 tgtCands.readBinWithAlignment(ot);
00246 else
00247 tgtCands.readBin(ot);
00248 }
00249
00250 void PrintTgtCand(const TgtCands& tcands,std::ostream& out) const;
00251
00252
00253 void ConvertTgtCand(const TgtCands& tcands,std::vector<StringTgtCand>& extTgtCands,
00254 std::vector<std::string>* wa) const {
00255 extTgtCands.reserve(tcands.size());
00256 for(TgtCands::const_iterator iter=tcands.begin(); iter!=tcands.end(); ++iter) {
00257 const TgtCand &intTgtCand = *iter;
00258
00259 extTgtCands.push_back(StringTgtCand());
00260 StringTgtCand &extTgtCand = extTgtCands.back();
00261
00262 const IPhrase& iphrase = intTgtCand.GetPhrase();
00263
00264 extTgtCand.tokens.reserve(iphrase.size());
00265 for(size_t j=0; j<iphrase.size(); ++j) {
00266 extTgtCand.tokens.push_back(&tv.symbol(iphrase[j]));
00267 }
00268 extTgtCand.scores = intTgtCand.GetScores();
00269 const IPhrase& fnames = intTgtCand.GetFeatureNames();
00270 for (size_t j = 0; j < fnames.size(); ++j) {
00271 extTgtCand.fnames.push_back(&tv.symbol(fnames[j]));
00272 }
00273 extTgtCand.fvalues = intTgtCand.GetFeatureValues();
00274 if (wa) wa->push_back(intTgtCand.GetAlignment());
00275 }
00276 }
00277
00278 PPtr GetRoot() {
00279 return PPtr(pPool.get(PPimp(0,0,1)));
00280 }
00281
00282 PPtr Extend(PPtr p,const std::string& w) {
00283 UTIL_THROW_IF2(p == 0L, "Error");
00284
00285
00286 if(w.empty() || w==EPSILON) return p;
00287
00288 LabelId wi=sv.index(w);
00289
00290 if(wi==InvalidLabelId) return PPtr();
00291 else if(p.imp->isRoot()) {
00292 if(wi<data.size() && data[wi]) {
00293 const void* ptr = data[wi]->findKeyPtr(wi);
00294 UTIL_THROW_IF2(ptr == NULL, "Error");
00295
00296 return PPtr(pPool.get(PPimp(data[wi],data[wi]->findKey(wi),0)));
00297 }
00298 } else if(PTF const* nextP=p.imp->ptr()->getPtr(p.imp->idx)) {
00299 return PPtr(pPool.get(PPimp(nextP,nextP->findKey(wi),0)));
00300 }
00301
00302 return PPtr();
00303 }
00304
00305 WordVoc* ReadVoc(const std::string& filename);
00306 };
00307
00308
00310
00311
00312
00314
00315 int PDTimp::Read(const std::string& fn)
00316 {
00317 std::string ifs, ift, ifi, ifsv, iftv;
00318
00319 HasAlignmentInfo(FileExists(fn+".binphr.srctree.wa"));
00320
00321 if (NeedAlignmentInfo() && !HasAlignmentInfo()) {
00322
00323 std::cerr << "You are asking for word alignment but the binary phrase table does not contain any alignment info. Please check if you had generated the correct phrase table with word alignment (.wa)\n";
00324 return false;
00325 }
00326
00327 if (HasAlignmentInfo()) {
00328 ifs=fn+".binphr.srctree.wa";
00329 ift=fn+".binphr.tgtdata.wa";
00330 } else {
00331 ifs=fn+".binphr.srctree";
00332 ift=fn+".binphr.tgtdata";
00333 }
00334
00335 ifi=fn+".binphr.idx";
00336 ifsv=fn+".binphr.srcvoc";
00337 iftv=fn+".binphr.tgtvoc";
00338
00339 FILE *ii=fOpen(ifi.c_str(),"rb");
00340 fReadVector(ii,srcOffsets);
00341 fClose(ii);
00342
00343 os=fOpen(ifs.c_str(),"rb");
00344 ot=fOpen(ift.c_str(),"rb");
00345
00346 data.resize(srcOffsets.size());
00347 for(size_t i=0; i<data.size(); ++i)
00348 data[i]=CPT(os,srcOffsets[i]);
00349
00350 sv.Read(ifsv);
00351 tv.Read(iftv);
00352
00353 VERBOSE(1,"binary phrasefile loaded, default OFF_T: "
00354 <<PTF::getDefault() <<"\n");
00355 return 1;
00356 }
00357
00358 void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const
00359 {
00360 for(size_t i=0; i<tcand.size(); ++i) {
00361
00362 Scores sc=tcand[i].GetScores();
00363 std::string trgAlign = tcand[i].GetAlignment();
00364
00365 const IPhrase& iphr=tcand[i].GetPhrase();
00366
00367 out << i << " -- " << sc << " -- ";
00368 for(size_t j=0; j<iphr.size(); ++j) out << tv.symbol(iphr[j])<<" ";
00369 out<< " -- " << trgAlign;
00370 out << std::endl;
00371 }
00372 }
00373
00375
00376
00377
00379
00380 PhraseDictionaryTree::PhraseDictionaryTree()
00381 : imp(new PDTimp)
00382 {
00383 if(sizeof(OFF_T)!=8) {
00384 UTIL_THROW2("ERROR: size of type 'OFF_T' has to be 64 bit!\n"
00385 "In gcc, use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n");
00386 }
00387 }
00388
00389 PhraseDictionaryTree::~PhraseDictionaryTree()
00390 {
00391 delete imp;
00392 }
00393
00394 void PhraseDictionaryTree::NeedAlignmentInfo(bool a)
00395 {
00396 imp->NeedAlignmentInfo(a);
00397 };
00398 void PhraseDictionaryTree::PrintWordAlignment(bool a)
00399 {
00400 imp->PrintWordAlignment(a);
00401 };
00402 bool PhraseDictionaryTree::PrintWordAlignment()
00403 {
00404 return imp->PrintWordAlignment();
00405 };
00406
00407 void PhraseDictionaryTree::FreeMemory() const
00408 {
00409 imp->FreeMemory();
00410 }
00411
00412
00413 void PhraseDictionaryTree::
00414 GetTargetCandidates(const std::vector<std::string>& src,
00415 std::vector<StringTgtCand>& rv) const
00416 {
00417 IPhrase f(src.size());
00418 for(size_t i=0; i<src.size(); ++i) {
00419 f[i]=imp->sv.index(src[i]);
00420 if(f[i]==InvalidLabelId) return;
00421 }
00422
00423 TgtCands tgtCands;
00424 imp->GetTargetCandidates(f,tgtCands);
00425 imp->ConvertTgtCand(tgtCands,rv,NULL);
00426 }
00427
00428 void PhraseDictionaryTree::
00429 GetTargetCandidates(const std::vector<std::string>& src,
00430 std::vector<StringTgtCand>& rv,
00431 std::vector<std::string>& wa) const
00432 {
00433 IPhrase f(src.size());
00434 for(size_t i=0; i<src.size(); ++i) {
00435 f[i]=imp->sv.index(src[i]);
00436 if(f[i]==InvalidLabelId) return;
00437 }
00438
00439 TgtCands tgtCands;
00440 imp->GetTargetCandidates(f,tgtCands);
00441 imp->ConvertTgtCand(tgtCands,rv,&wa);
00442 }
00443
00444
00445 void PhraseDictionaryTree::
00446 PrintTargetCandidates(const std::vector<std::string>& src,
00447 std::ostream& out) const
00448 {
00449 IPhrase f(src.size());
00450 for(size_t i=0; i<src.size(); ++i) {
00451 f[i]=imp->sv.index(src[i]);
00452 if(f[i]==InvalidLabelId) {
00453 TRACE_ERR("the source phrase '"<<src<<"' contains an unknown word '"
00454 <<src[i]<<"'\n");
00455 return;
00456 }
00457 }
00458
00459 TgtCands tcand;
00460 imp->GetTargetCandidates(f,tcand);
00461 imp->PrintTgtCand(tcand,out);
00462 }
00463
00464 int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
00465 {
00466 std::string line;
00467 size_t count = 0;
00468
00469 std::string ofn(out+".binphr.srctree"),
00470 oft(out+".binphr.tgtdata"),
00471 ofi(out+".binphr.idx"),
00472 ofsv(out+".binphr.srcvoc"),
00473 oftv(out+".binphr.tgtvoc");
00474
00475 if (PrintWordAlignment()) {
00476 ofn+=".wa";
00477 oft+=".wa";
00478 }
00479
00480 FILE *os=fOpen(ofn.c_str(),"wb"),
00481 *ot=fOpen(oft.c_str(),"wb");
00482
00483 typedef PrefixTreeSA<LabelId,OFF_T> PSA;
00484 PSA *psa=new PSA;
00485 PSA::setDefault(InvalidOffT);
00486
00487 LabelId currFirstWord=InvalidLabelId;
00488 IPhrase currF;
00489 TgtCands tgtCands;
00490 std::vector<OFF_T> vo;
00491 size_t lnc=0;
00492 size_t numElement = NOT_FOUND;
00493 size_t missingAlignmentCount = 0;
00494
00495 while(getline(inFile, line)) {
00496 ++lnc;
00497
00498 std::vector<std::string> tokens = TokenizeMultiCharSeparator( line , "|||" );
00499
00500 if (numElement == NOT_FOUND) {
00501
00502 numElement = tokens.size();
00503 UTIL_THROW_IF2(numElement < (PrintWordAlignment()?4:3),
00504 "Format error");
00505 }
00506
00507 if (tokens.size() != numElement) {
00508 UTIL_THROW2("Syntax error at line " << lnc << " : " << line);
00509 }
00510
00511 const std::string &sourcePhraseString =tokens[0]
00512 ,&targetPhraseString=tokens[1]
00513 ,&scoreString = tokens[2];
00514 const std::string empty;
00515 const std::string &alignmentString = PrintWordAlignment() ? tokens[3] : empty;
00516 const std::string sparseFeatureString = tokens.size() > 5 ? tokens[5] : empty;
00517 IPhrase f,e;
00518 Scores sc;
00519
00520 if (PrintWordAlignment() && alignmentString == " ") ++missingAlignmentCount;
00521
00522 std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
00523 for (size_t i = 0 ; i < wordVec.size() ; ++i)
00524 f.push_back(imp->sv.add(wordVec[i]));
00525
00526 wordVec = Tokenize(targetPhraseString);
00527 for (size_t i = 0 ; i < wordVec.size() ; ++i)
00528 e.push_back(imp->tv.add(wordVec[i]));
00529
00530
00531
00532 std::vector<float> scoreVector = Tokenize<float>(scoreString);
00533 for (size_t i = 0 ; i < scoreVector.size() ; ++i) {
00534 float tmp = scoreVector[i];
00535 sc.push_back(((tmp>0.0)?tmp:(float)1.0e-38));
00536 }
00537
00538 if(f.empty()) {
00539 TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
00540 continue;
00541 }
00542
00543 if(currFirstWord==InvalidLabelId) currFirstWord=f[0];
00544 if(currF.empty()) {
00545 ++count;
00546 currF=f;
00547
00548 UTIL_THROW_IF2(psa == NULL, "Error");
00549
00550 PSA::Data& d=psa->insert(f);
00551 if(d==InvalidOffT) d=fTell(ot);
00552 else {
00553 UTIL_THROW2("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '"
00554 <<line);
00555 }
00556 }
00557
00558 IPhrase fnames;
00559 std::vector<FValue> fvalues;
00560 if (!sparseFeatureString.empty()) {
00561 std::vector<std::string> sparseTokens = Tokenize(sparseFeatureString);
00562 if (sparseTokens.size() % 2 != 0) {
00563 UTIL_THROW2("ERROR: incorrectly formatted sparse feature string: " <<
00564 sparseFeatureString);
00565 }
00566 for (size_t i = 0; i < sparseTokens.size(); i+=2) {
00567 fnames.push_back(imp->tv.add(sparseTokens[i]));
00568 fvalues.push_back(Scan<FValue>(sparseTokens[i+1]));
00569 }
00570 }
00571
00572 if(currF!=f) {
00573
00574 currF=f;
00575 if (PrintWordAlignment())
00576 tgtCands.writeBinWithAlignment(ot);
00577 else
00578 tgtCands.writeBin(ot);
00579 tgtCands.clear();
00580
00581 if(++count%10000==0) {
00582 TRACE_ERR(".");
00583 if(count%500000==0) TRACE_ERR("[phrase:"<<count<<"]\n");
00584 }
00585
00586 if(f[0]!=currFirstWord) {
00587
00588 PTF pf;
00589 if(currFirstWord>=vo.size())
00590 vo.resize(currFirstWord+1,InvalidOffT);
00591 vo[currFirstWord]=fTell(os);
00592 pf.create(*psa,os);
00593
00594 delete psa;
00595 psa=new PSA;
00596 currFirstWord=f[0];
00597 }
00598
00599
00600 UTIL_THROW_IF2(psa == NULL, "Error");
00601
00602 PSA::Data& d=psa->insert(f);
00603 if(d==InvalidOffT) d=fTell(ot);
00604 else {
00605 UTIL_THROW2("ERROR: xsource phrase already inserted (B)!\nline(" << lnc << "): '"
00606 <<line);
00607 }
00608 }
00609 tgtCands.push_back(TgtCand(e,sc, alignmentString));
00610 UTIL_THROW_IF2(currFirstWord == InvalidLabelId,
00611 "Uninitialize word");
00612 tgtCands.back().SetFeatures(fnames, fvalues);
00613 }
00614 if (PrintWordAlignment())
00615 tgtCands.writeBinWithAlignment(ot);
00616 else
00617 tgtCands.writeBin(ot);
00618 tgtCands.clear();
00619
00620 PTF pf;
00621 if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT);
00622 vo[currFirstWord]=fTell(os);
00623 pf.create(*psa,os);
00624 delete psa;
00625 psa=0;
00626
00627 TRACE_ERR("distinct source phrases: "<<count
00628 <<" distinct first words of source phrases: "<<vo.size()
00629 <<" number of phrase pairs (line count): "<<lnc
00630 <<"\n");
00631
00632 if ( PrintWordAlignment()) {
00633 TRACE_ERR("Count of lines with missing alignments: " <<
00634 missingAlignmentCount << "/" << lnc << "\n");
00635 }
00636
00637 fClose(os);
00638 fClose(ot);
00639
00640 std::vector<size_t> inv;
00641 for(size_t i=0; i<vo.size(); ++i)
00642 if(vo[i]==InvalidOffT) inv.push_back(i);
00643
00644 if(inv.size()) {
00645 TRACE_ERR("WARNING: there are src voc entries with no phrase "
00646 "translation: count "<<inv.size()<<"\n"
00647 "There exists phrase translations for "<<vo.size()-inv.size()
00648 <<" entries\n");
00649 }
00650
00651 FILE *oi=fOpen(ofi.c_str(),"wb");
00652 fWriteVector(oi,vo);
00653 fClose(oi);
00654
00655 imp->sv.Write(ofsv);
00656 imp->tv.Write(oftv);
00657
00658 return 1;
00659 }
00660
00661
00662 int PhraseDictionaryTree::Read(const std::string& fn)
00663 {
00664 VERBOSE(1,"size of OFF_T "<<sizeof(OFF_T)<<"\n");
00665 return imp->Read(fn);
00666 }
00667
00668
00669 PhraseDictionaryTree::PrefixPtr PhraseDictionaryTree::GetRoot() const
00670 {
00671 return imp->GetRoot();
00672 }
00673
00674 PhraseDictionaryTree::PrefixPtr
00675 PhraseDictionaryTree::Extend(PrefixPtr p, const std::string& w) const
00676 {
00677 return imp->Extend(p,w);
00678 }
00679
00680 void PhraseDictionaryTree::PrintTargetCandidates(PrefixPtr p,std::ostream& out) const
00681 {
00682
00683 TgtCands tcand;
00684 imp->GetTargetCandidates(p,tcand);
00685 out<<"there are "<<tcand.size()<<" target candidates\n";
00686 imp->PrintTgtCand(tcand,out);
00687 }
00688
00689 void PhraseDictionaryTree::
00690 GetTargetCandidates(PrefixPtr p,
00691 std::vector<StringTgtCand>& rv) const
00692 {
00693 TgtCands tcands;
00694 imp->GetTargetCandidates(p,tcands);
00695 imp->ConvertTgtCand(tcands,rv,NULL);
00696 }
00697
00698 void PhraseDictionaryTree::
00699 GetTargetCandidates(PrefixPtr p,
00700 std::vector<StringTgtCand>& rv,
00701 std::vector<std::string>& wa) const
00702 {
00703 TgtCands tcands;
00704 imp->GetTargetCandidates(p,tcands);
00705 imp->ConvertTgtCand(tcands,rv,&wa);
00706 }
00707
00708 }
00709