00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <sstream>
00021 #include "ExtractionPhrasePair.h"
00022 #include "tables-core.h"
00023 #include "score.h"
00024 #include "moses/Util.h"
00025
00026 #include <cstdlib>
00027
00028 using namespace std;
00029
00030
00031 namespace MosesTraining
00032 {
00033
00034
00035 extern Vocabulary vcbT;
00036 extern Vocabulary vcbS;
00037
00038 extern bool hierarchicalFlag;
00039
00040
00041 ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
00042 const PHRASE *phraseTarget,
00043 ALIGNMENT *targetToSourceAlignment,
00044 float count, float pcfgSum ) :
00045 m_phraseSource(phraseSource),
00046 m_phraseTarget(phraseTarget),
00047 m_count(count),
00048 m_pcfgSum(pcfgSum)
00049 {
00050 assert(!phraseSource->empty());
00051
00052 m_count = count;
00053 m_pcfgSum = pcfgSum;
00054
00055 std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
00056 m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
00057
00058 m_lastTargetToSourceAlignment = insertedAlignment.first;
00059 m_lastCount = m_count;
00060 m_lastPcfgSum = m_pcfgSum;
00061
00062 m_isValid = true;
00063 }
00064
00065
00066 ExtractionPhrasePair::~ExtractionPhrasePair( )
00067 {
00068 Clear();
00069 }
00070
00071
00072
00073
00074 bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
00075 float count, float pcfgSum )
00076 {
00077 m_count += count;
00078 m_pcfgSum += pcfgSum;
00079
00080 m_lastCount = count;
00081 m_lastPcfgSum = pcfgSum;
00082
00083 std::map<ALIGNMENT*,float>::iterator iter = m_lastTargetToSourceAlignment;
00084 if ( *(iter->first) == *targetToSourceAlignment ) {
00085 iter->second += count;
00086 return false;
00087 } else {
00088 std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
00089 m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
00090 if ( !insertedAlignment.second ) {
00091
00092 insertedAlignment.first->second += count;
00093 return false;
00094 }
00095 m_lastTargetToSourceAlignment = insertedAlignment.first;
00096 }
00097
00098 return true;
00099 }
00100
00101
00102 void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
00103 {
00104 m_count += count;
00105 m_pcfgSum += pcfgSum;
00106 m_lastTargetToSourceAlignment->second += count;
00107
00108 for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
00109 iter !=m_properties.end(); ++iter ) {
00110 LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
00111 (*lastPropertyValue)->second += count;
00112 }
00113
00114 m_lastCount = count;
00115 m_lastPcfgSum = pcfgSum;
00116 }
00117
00118
00119
00120
00121 bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
00122 const PHRASE *otherPhraseTarget,
00123 ALIGNMENT *otherTargetToSourceAlignment ) const
00124 {
00125 if (*otherPhraseTarget != *m_phraseTarget) {
00126 return false;
00127 }
00128 if (*otherPhraseSource != *m_phraseSource) {
00129 return false;
00130 }
00131
00132 return MatchesAlignment( otherTargetToSourceAlignment );
00133 }
00134
00135
00136
00137
00138
00139
00140 bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
00141 const PHRASE *otherPhraseTarget,
00142 ALIGNMENT *otherTargetToSourceAlignment,
00143 bool &sourceMatch,
00144 bool &targetMatch,
00145 bool &alignmentMatch ) const
00146 {
00147 if (*otherPhraseSource != *m_phraseSource) {
00148 sourceMatch = false;
00149 return false;
00150 } else {
00151 sourceMatch = true;
00152 }
00153 if (*otherPhraseTarget != *m_phraseTarget) {
00154 targetMatch = false;
00155 return false;
00156 } else {
00157 targetMatch = true;
00158 }
00159 if ( !MatchesAlignment(otherTargetToSourceAlignment) ) {
00160 alignmentMatch = false;
00161 return false;
00162 } else {
00163 alignmentMatch = true;
00164 }
00165 return true;
00166 }
00167
00168
00169
00170 bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
00171 {
00172 if (!hierarchicalFlag) return true;
00173
00174
00175 const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;
00176
00177 assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
00178 assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());
00179
00180
00181 for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) {
00182 if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
00183 size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin());
00184 size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin());
00185
00186 if (thisTargetToSourceAlignment->at(i).size() != 1 ||
00187 otherTargetToSourceAlignment->at(i).size() != 1 ||
00188 thisAlign != otherAlign) {
00189 return false;
00190 }
00191 }
00192 }
00193
00194 return true;
00195 }
00196
00197 void ExtractionPhrasePair::Clear()
00198 {
00199 delete m_phraseSource;
00200 delete m_phraseTarget;
00201
00202 m_count = 0.0f;
00203 m_pcfgSum = 0.0f;
00204
00205 for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
00206 iter!=m_targetToSourceAlignments.end(); ++iter) {
00207 delete iter->first;
00208 }
00209 m_targetToSourceAlignments.clear();
00210
00211 for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
00212 iter!=m_properties.end(); ++iter) {
00213 delete (iter->second).second;
00214 delete (iter->second).first;
00215 }
00216 m_properties.clear();
00217
00218 m_lastCount = 0.0f;
00219 m_lastPcfgSum = 0.0f;
00220 m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();
00221
00222 m_isValid = false;
00223 }
00224
00225
00226 void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
00227 {
00228 if (propertiesString.empty()) {
00229 return;
00230 }
00231
00232 vector<std::string> toks;
00233 Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
00234 for (size_t i = 1; i < toks.size(); ++i) {
00235 std::string &tok = toks[i];
00236 if (tok.empty()) {
00237 continue;
00238 }
00239 size_t endPos = tok.rfind("}");
00240 tok = tok.substr(0, endPos - 1);
00241
00242 vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
00243 if (keyValue.size() == 2) {
00244 AddProperty(keyValue[0], keyValue[1], count);
00245 }
00246 }
00247 }
00248
00249
00250 const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
00251 {
00252 float bestAlignmentCount = -1;
00253
00254 std::map<ALIGNMENT*,float>::const_iterator bestAlignment = m_targetToSourceAlignments.end();
00255
00256 for (std::map<ALIGNMENT*,float>::const_iterator iter=m_targetToSourceAlignments.begin();
00257 iter!=m_targetToSourceAlignments.end(); ++iter) {
00258 if ( (iter->second > bestAlignmentCount) ||
00259 ( (iter->second == bestAlignmentCount) &&
00260 (*(iter->first) > *(bestAlignment->first)) ) ) {
00261 bestAlignmentCount = iter->second;
00262 bestAlignment = iter;
00263 }
00264 }
00265
00266 if ( bestAlignment == m_targetToSourceAlignments.end()) {
00267 return NULL;
00268 }
00269
00270 return bestAlignment->first;
00271 }
00272
00273
00274 const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
00275 {
00276 float bestPropertyCount = -1;
00277
00278 const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
00279 if ( allPropertyValues == NULL ) {
00280 return NULL;
00281 }
00282
00283 PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end();
00284
00285 for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
00286 iter!=allPropertyValues->end(); ++iter) {
00287 if ( (iter->second > bestPropertyCount) ||
00288 ( (iter->second == bestPropertyCount) &&
00289 (iter->first > bestPropertyValue->first) ) ) {
00290 bestPropertyCount = iter->second;
00291 bestPropertyValue = iter;
00292 }
00293 }
00294
00295 if ( bestPropertyValue == allPropertyValues->end()) {
00296 return NULL;
00297 }
00298
00299 return &(bestPropertyValue->first);
00300 }
00301
00302
00303 std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
00304 {
00305 const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
00306
00307 if ( allPropertyValues == NULL ) {
00308 return "";
00309 }
00310
00311 std::ostringstream oss;
00312 for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
00313 iter!=allPropertyValues->end(); ++iter) {
00314 if (!(iter->first).empty()) {
00315 if (iter!=allPropertyValues->begin()) {
00316 oss << " ";
00317 }
00318 oss << iter->first;
00319 oss << " ";
00320 oss << iter->second;
00321 }
00322 }
00323
00324 std::string allPropertyValuesString(oss.str());
00325 return allPropertyValuesString;
00326 }
00327
00328
00329 std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
00330 std::set<std::string>& labelSet,
00331 boost::unordered_map<std::string,float>& countsLabelsLHS,
00332 boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& jointCountsRulesTargetLHSAndLabelsLHS,
00333 Vocabulary &vcbT) const
00334 {
00335 const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
00336
00337 if ( allPropertyValues == NULL ) {
00338 return "";
00339 }
00340
00341 std::string lhs="", rhs="", currentRhs="";
00342 float currentRhsCount = 0.0;
00343 std::list< std::pair<std::string,float> > lhsGivenCurrentRhsCounts;
00344
00345 std::ostringstream oss;
00346 for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
00347 iter!=allPropertyValues->end(); ++iter) {
00348
00349 size_t space = (iter->first).find_last_of(' ');
00350 if ( space == string::npos ) {
00351 lhs = iter->first;
00352 rhs.clear();
00353 } else {
00354 lhs = (iter->first).substr(space+1);
00355 rhs = (iter->first).substr(0,space);
00356 }
00357
00358 labelSet.insert(lhs);
00359
00360 if ( rhs.compare(currentRhs) ) {
00361
00362 if ( iter!=allPropertyValues->begin() ) {
00363 if ( !currentRhs.empty() ) {
00364 istringstream tokenizer(currentRhs);
00365 std::string rhsLabel;
00366 while ( tokenizer.peek() != EOF ) {
00367 tokenizer >> rhsLabel;
00368 labelSet.insert(rhsLabel);
00369 }
00370 oss << " " << currentRhs << " " << currentRhsCount;
00371 }
00372 if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
00373 if ( !currentRhs.empty() ) {
00374 oss << " " << lhsGivenCurrentRhsCounts.size();
00375 }
00376 for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
00377 iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
00378 oss << " " << iter2->first << " " << iter2->second;
00379
00380
00381 std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
00382 ruleTargetLhs.erase(ruleTargetLhs.begin());
00383 ruleTargetLhs.erase(ruleTargetLhs.size()-1);
00384
00385 std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
00386 countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
00387 if (!insertedCountsLabelsLHS.second) {
00388 (insertedCountsLabelsLHS.first)->second += iter2->second;
00389 }
00390
00391 boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
00392 jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
00393 if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
00394 boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
00395 jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
00396 jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
00397 } else {
00398 boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
00399 std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
00400 jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
00401 if (!insertedJointCounts.second) {
00402 (insertedJointCounts.first)->second += iter2->second;
00403 }
00404 }
00405
00406 }
00407 }
00408
00409 lhsGivenCurrentRhsCounts.clear();
00410 }
00411
00412 currentRhsCount = 0.0;
00413 currentRhs = rhs;
00414 }
00415
00416 currentRhsCount += iter->second;
00417 lhsGivenCurrentRhsCounts.push_back( std::pair<std::string,float>(lhs,iter->second) );
00418 }
00419
00420 if ( !currentRhs.empty() ) {
00421 istringstream tokenizer(currentRhs);
00422 std::string rhsLabel;
00423 while ( tokenizer.peek() != EOF ) {
00424 tokenizer >> rhsLabel;
00425 labelSet.insert(rhsLabel);
00426 }
00427 oss << " " << currentRhs << " " << currentRhsCount;
00428 }
00429 if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
00430 if ( !currentRhs.empty() ) {
00431 oss << " " << lhsGivenCurrentRhsCounts.size();
00432 }
00433 for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
00434 iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
00435 oss << " " << iter2->first << " " << iter2->second;
00436
00437
00438 std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
00439 ruleTargetLhs.erase(ruleTargetLhs.begin());
00440 ruleTargetLhs.erase(ruleTargetLhs.size()-1);
00441
00442 std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
00443 countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
00444 if (!insertedCountsLabelsLHS.second) {
00445 (insertedCountsLabelsLHS.first)->second += iter2->second;
00446 }
00447
00448 boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
00449 jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
00450 if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
00451 boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
00452 jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
00453 jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
00454 } else {
00455 boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
00456 std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
00457 jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
00458 if (!insertedJointCounts.second) {
00459 (insertedJointCounts.first)->second += iter2->second;
00460 }
00461 }
00462
00463 }
00464 }
00465
00466 std::string allPropertyValuesString(oss.str());
00467 return allPropertyValuesString;
00468 }
00469
00470
00471 void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key,
00472 const std::vector<float> &orientationClassPriorsL2R,
00473 const std::vector<float> &orientationClassPriorsR2L,
00474 double smoothingFactor,
00475 std::ostream &out) const
00476 {
00477 assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4);
00478
00479 const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
00480
00481 if ( allPropertyValues == NULL ) {
00482 return;
00483 }
00484
00485
00486
00487 std::vector<float> orientationClassCountSumL2R(4,0);
00488 std::vector<float> orientationClassCountSumR2L(4,0);
00489
00490 for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
00491 iter!=allPropertyValues->end(); ++iter) {
00492 std::string l2rOrientationClass, r2lOrientationClass;
00493 try {
00494 istringstream tokenizer(iter->first);
00495 tokenizer >> l2rOrientationClass;
00496 tokenizer >> r2lOrientationClass;
00497 if ( tokenizer.peek() != EOF ) {
00498 UTIL_THROW(util::Exception, "ExtractionPhrasePair"
00499 << ": Collecting phrase orientations failed. "
00500 << "Too many tokens?");
00501 }
00502 } catch (const std::exception &e) {
00503 UTIL_THROW(util::Exception, "ExtractionPhrasePair"
00504 << ": Collecting phrase orientations failed. "
00505 << "Flawed property value in extract file?");
00506 }
00507
00508 int l2rOrientationClassId = -1;
00509 if (!l2rOrientationClass.compare("mono")) {
00510 l2rOrientationClassId = 0;
00511 }
00512 if (!l2rOrientationClass.compare("swap")) {
00513 l2rOrientationClassId = 1;
00514 }
00515 if (!l2rOrientationClass.compare("dleft")) {
00516 l2rOrientationClassId = 2;
00517 }
00518 if (!l2rOrientationClass.compare("dright")) {
00519 l2rOrientationClassId = 3;
00520 }
00521 if (l2rOrientationClassId == -1) {
00522 UTIL_THROW(util::Exception, "ExtractionPhrasePair"
00523 << ": Collecting phrase orientations failed. "
00524 << "Unknown orientation class \"" << l2rOrientationClass << "\"." );
00525 }
00526 int r2lOrientationClassId = -1;
00527 if (!r2lOrientationClass.compare("mono")) {
00528 r2lOrientationClassId = 0;
00529 }
00530 if (!r2lOrientationClass.compare("swap")) {
00531 r2lOrientationClassId = 1;
00532 }
00533 if (!r2lOrientationClass.compare("dleft")) {
00534 r2lOrientationClassId = 2;
00535 }
00536 if (!r2lOrientationClass.compare("dright")) {
00537 r2lOrientationClassId = 3;
00538 }
00539 if (r2lOrientationClassId == -1) {
00540 UTIL_THROW(util::Exception, "ExtractionPhrasePair"
00541 << ": Collecting phrase orientations failed. "
00542 << "Unknown orientation class \"" << r2lOrientationClass << "\"." );
00543 }
00544
00545 orientationClassCountSumL2R[l2rOrientationClassId] += iter->second;
00546 orientationClassCountSumR2L[r2lOrientationClassId] += iter->second;
00547 }
00548
00549 for (size_t i=0; i<4; ++i) {
00550 if (i>0) {
00551 out << " ";
00552 }
00553 out << (float)( (smoothingFactor*orientationClassPriorsL2R[i] + orientationClassCountSumL2R[i]) / (smoothingFactor + m_count) );
00554 }
00555 for (size_t i=0; i<4; ++i) {
00556 out << " " << (float)( (smoothingFactor*orientationClassPriorsR2L[i] + orientationClassCountSumR2L[i]) / (smoothingFactor + m_count) );
00557 }
00558 }
00559
00560
00561 void ExtractionPhrasePair::UpdateVocabularyFromValueTokens(const std::string& propertyKey,
00562 std::set<std::string>& vocabulary) const
00563 {
00564 const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
00565
00566 if ( allPropertyValues == NULL ) {
00567 return;
00568 }
00569
00570 for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
00571 iter!=allPropertyValues->end(); ++iter) {
00572
00573 std::vector<std::string> tokens = Moses::Tokenize(iter->first);
00574 for (std::vector<std::string>::const_iterator tokenIt=tokens.begin();
00575 tokenIt!=tokens.end(); ++tokenIt) {
00576 vocabulary.insert(*tokenIt);
00577 }
00578 }
00579 }
00580
00581
00582
00583 }
00584