00001
00002
00003
00004
00005
00006
00007
00008 #include <sstream>
00009 #include <algorithm>
00010 #include "Rule.h"
00011 #include "AlignedSentence.h"
00012 #include "ConsistentPhrase.h"
00013 #include "NonTerm.h"
00014 #include "Parameter.h"
00015
00016 using namespace std;
00017
00018 Rule::Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence)
00019 :m_lhs(lhsNonTerm)
00020 ,m_alignedSentence(alignedSentence)
00021 ,m_isValid(true)
00022 ,m_canRecurse(true)
00023 {
00024 CreateSource();
00025 }
00026
00027 Rule::Rule(const Rule ©, const NonTerm &nonTerm)
00028 :m_lhs(copy.m_lhs)
00029 ,m_alignedSentence(copy.m_alignedSentence)
00030 ,m_nonterms(copy.m_nonterms)
00031 ,m_isValid(true)
00032 ,m_canRecurse(true)
00033 {
00034 m_nonterms.push_back(&nonTerm);
00035 CreateSource();
00036
00037 }
00038
00039 Rule::~Rule()
00040 {
00041
00042 }
00043
00044 const ConsistentPhrase &Rule::GetConsistentPhrase() const
00045 {
00046 return m_lhs.GetConsistentPhrase();
00047 }
00048
00049 void Rule::CreateSource()
00050 {
00051 const NonTerm *cp = NULL;
00052 size_t nonTermInd = 0;
00053 if (nonTermInd < m_nonterms.size()) {
00054 cp = m_nonterms[nonTermInd];
00055 }
00056
00057 for (int sourcePos = m_lhs.GetConsistentPhrase().corners[0];
00058 sourcePos <= m_lhs.GetConsistentPhrase().corners[1];
00059 ++sourcePos) {
00060
00061 const RuleSymbol *ruleSymbol;
00062 if (cp && cp->GetConsistentPhrase().corners[0] <= sourcePos && sourcePos <= cp->GetConsistentPhrase().corners[1]) {
00063
00064 ruleSymbol = cp;
00065 sourcePos = cp->GetConsistentPhrase().corners[1];
00066 if (m_nonterms.size()) {
00067 cp = m_nonterms[nonTermInd];
00068 }
00069
00070
00071 ++nonTermInd;
00072 cp = (nonTermInd < m_nonterms.size()) ? m_nonterms[nonTermInd] : NULL;
00073 } else {
00074
00075 ruleSymbol = m_alignedSentence.GetPhrase(Moses::Input)[sourcePos];
00076 }
00077
00078 m_source.Add(ruleSymbol);
00079 }
00080 }
00081
00082 int Rule::GetNextSourcePosForNonTerm() const
00083 {
00084 if (m_nonterms.empty()) {
00085
00086 return m_lhs.GetConsistentPhrase().corners[0];
00087 } else {
00088
00089 const ConsistentPhrase &cp = m_nonterms.back()->GetConsistentPhrase();
00090 int nextPos = cp.corners[1] + 1;
00091 return nextPos;
00092 }
00093 }
00094
00095 std::string Rule::Debug() const
00096 {
00097 stringstream out;
00098
00099
00100 for (size_t i = 0; i < m_source.GetSize(); ++i) {
00101 const RuleSymbol &symbol = *m_source[i];
00102 out << symbol.Debug() << " ";
00103 }
00104
00105
00106 out << "||| ";
00107 for (size_t i = 0; i < m_target.GetSize(); ++i) {
00108 const RuleSymbol &symbol = *m_target[i];
00109 out << symbol.Debug() << " ";
00110 }
00111
00112 out << "||| ";
00113 Alignments::const_iterator iterAlign;
00114 for (iterAlign = m_alignments.begin(); iterAlign != m_alignments.end(); ++iterAlign) {
00115 const std::pair<int,int> &alignPair = *iterAlign;
00116 out << alignPair.first << "-" << alignPair.second << " ";
00117 }
00118
00119
00120 out << "||| LHS=" << m_lhs.Debug();
00121
00122 return out.str();
00123 }
00124
00125 void Rule::Output(std::ostream &out, bool forward) const
00126 {
00127 if (forward) {
00128
00129 m_source.Output(out);
00130 m_lhs.Output(out, Moses::Input);
00131
00132 out << " ||| ";
00133
00134
00135 m_target.Output(out);
00136 m_lhs.Output(out, Moses::Output);
00137 } else {
00138
00139 m_target.Output(out);
00140 m_lhs.Output(out, Moses::Output);
00141
00142 out << " ||| ";
00143
00144
00145 m_source.Output(out);
00146 m_lhs.Output(out, Moses::Input);
00147 }
00148
00149 out << " ||| ";
00150
00151
00152 Alignments::const_iterator iterAlign;
00153 for (iterAlign = m_alignments.begin(); iterAlign != m_alignments.end(); ++iterAlign) {
00154 const std::pair<int,int> &alignPair = *iterAlign;
00155
00156 if (forward) {
00157 out << alignPair.first << "-" << alignPair.second << " ";
00158 } else {
00159 out << alignPair.second << "-" << alignPair.first << " ";
00160 }
00161 }
00162
00163 out << "||| ";
00164
00165
00166 out << m_count;
00167
00168 out << " ||| ";
00169
00170
00171 if (forward) {
00172 for (size_t i = 0; i < m_properties.size(); ++i) {
00173 const Property &prop = m_properties[i];
00174 out << prop << " ";
00175 }
00176 }
00177 }
00178
00179 void Rule::NonTermContextFactor(int factor, const Word &word, std::ostream &out) const
00180 {
00181 out << word.GetString(factor) << " ";
00182 }
00183
00184 void Rule::NonTermContext(int sourceTarget, int factor, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const
00185 {
00186 int startPos, endPos;
00187 const Phrase *phrase;
00188
00189 if (sourceTarget == 1) {
00190 startPos = cp.corners[0];
00191 endPos = cp.corners[1];
00192 phrase = &m_alignedSentence.GetPhrase(Moses::Input);
00193 } else if (sourceTarget == 2) {
00194 startPos = cp.corners[2];
00195 endPos = cp.corners[3];
00196 phrase = &m_alignedSentence.GetPhrase(Moses::Output);
00197 } else {
00198 abort();
00199 }
00200
00201 out << ntInd << " ";
00202
00203
00204 if (startPos == 0) {
00205 out << "<s> ";
00206 } else {
00207 NonTermContextFactor(factor, *phrase->at(startPos - 1), out);
00208 }
00209
00210
00211 NonTermContextFactor(factor, *phrase->at(startPos), out);
00212
00213
00214 NonTermContextFactor(factor, *phrase->at(endPos), out);
00215
00216
00217 if (endPos == phrase->size() - 1) {
00218 out << "</s> ";
00219 } else {
00220 NonTermContextFactor(factor, *phrase->at(endPos + 1), out);
00221 }
00222
00223
00224 }
00225
00226 void Rule::Prevalidate(const Parameter ¶ms)
00227 {
00228
00229 if (m_source.GetSize() > params.maxSymbolsSource) {
00230 m_isValid = false;
00231 }
00232
00233
00234 if (m_nonterms.size()) {
00235 const NonTerm &lastNonTerm = *m_nonterms.back();
00236 const ConsistentPhrase &cp = lastNonTerm.GetConsistentPhrase();
00237
00238 int sourceWidth = cp.GetWidth(Moses::Input);
00239 if (lastNonTerm.IsHiero(params)) {
00240 if (sourceWidth < params.minHoleSource) {
00241 m_isValid = false;
00242 m_canRecurse = false;
00243 return;
00244 }
00245 } else if (sourceWidth < params.minHoleSourceSyntax) {
00246 m_isValid = false;
00247 m_canRecurse = false;
00248 return;
00249 }
00250
00251 }
00252
00253
00254 int numNonTerms = 0;
00255 int numHieroNonTerms = 0;
00256 for (size_t i = 0; i < m_source.GetSize(); ++i) {
00257 const RuleSymbol *arc = m_source[i];
00258 if (arc->IsNonTerm()) {
00259 ++numNonTerms;
00260 const NonTerm &nonTerm = *static_cast<const NonTerm*>(arc);
00261 bool isHiero = nonTerm.IsHiero(params);
00262 if (isHiero) {
00263 ++numHieroNonTerms;
00264 }
00265 }
00266 }
00267
00268 if (numNonTerms >= params.maxNonTerm) {
00269 m_canRecurse = false;
00270 if (numNonTerms > params.maxNonTerm) {
00271 m_isValid = false;
00272 return;
00273 }
00274 }
00275
00276 if (numHieroNonTerms >= params.maxHieroNonTerm) {
00277 m_canRecurse = false;
00278 if (numHieroNonTerms > params.maxHieroNonTerm) {
00279 m_isValid = false;
00280 return;
00281 }
00282 }
00283
00284
00285 if (!params.nonTermConsecSource && m_nonterms.size() >= 2) {
00286 const NonTerm &lastNonTerm = *m_nonterms.back();
00287 const NonTerm &secondLastNonTerm = *m_nonterms[m_nonterms.size() - 2];
00288 if (secondLastNonTerm.GetConsistentPhrase().corners[1] + 1 ==
00289 lastNonTerm.GetConsistentPhrase().corners[0]) {
00290 if (params.mixedSyntaxType == 0) {
00291
00292 m_isValid = false;
00293 m_canRecurse = false;
00294 return;
00295 } else {
00296
00297 switch (params.nonTermConsecSourceMixedSyntax) {
00298 case 0:
00299 m_isValid = false;
00300 m_canRecurse = false;
00301 return;
00302 case 1:
00303 if (lastNonTerm.IsHiero(Moses::Input, params)
00304 && secondLastNonTerm.IsHiero(Moses::Input, params)) {
00305 m_isValid = false;
00306 m_canRecurse = false;
00307 return;
00308 }
00309 break;
00310 case 2:
00311 if (lastNonTerm.IsHiero(Moses::Input, params)
00312 || secondLastNonTerm.IsHiero(Moses::Input, params)) {
00313 m_isValid = false;
00314 m_canRecurse = false;
00315 return;
00316 }
00317 break;
00318 case 3:
00319 break;
00320 }
00321 }
00322 }
00323 }
00324
00325
00326 if (m_nonterms.size() >= 2) {
00327 const NonTerm &lastNonTerm = *m_nonterms.back();
00328
00329 for (size_t i = 0; i < m_nonterms.size() - 1; ++i) {
00330 const NonTerm &otherNonTerm = *m_nonterms[i];
00331 bool overlap = lastNonTerm.GetConsistentPhrase().TargetOverlap(otherNonTerm.GetConsistentPhrase());
00332
00333 if (overlap) {
00334 m_isValid = false;
00335 m_canRecurse = false;
00336 return;
00337 }
00338 }
00339 }
00340
00341
00342 if (params.requireAlignedWord) {
00343 bool ok = false;
00344 for (size_t i = 0; i < m_source.GetSize(); ++i) {
00345 const RuleSymbol &symbol = *m_source[i];
00346 if (!symbol.IsNonTerm()) {
00347 const Word &word = static_cast<const Word&>(symbol);
00348 if (word.GetAlignment().size()) {
00349 ok = true;
00350 break;
00351 }
00352 }
00353 }
00354
00355 if (!ok) {
00356 m_isValid = false;
00357 m_canRecurse = false;
00358 return;
00359 }
00360 }
00361
00362 if (params.maxSpanFreeNonTermSource) {
00363 const NonTerm *front = dynamic_cast<const NonTerm*>(m_source[0]);
00364 if (front) {
00365 int width = front->GetWidth(Moses::Input);
00366 if (width > params.maxSpanFreeNonTermSource) {
00367 m_isValid = false;
00368 m_canRecurse = false;
00369 return;
00370 }
00371 }
00372
00373 const NonTerm *back = dynamic_cast<const NonTerm*>(m_source.Back());
00374 if (back) {
00375 int width = back->GetWidth(Moses::Input);
00376 if (width > params.maxSpanFreeNonTermSource) {
00377 m_isValid = false;
00378 m_canRecurse = false;
00379 return;
00380 }
00381 }
00382 }
00383
00384 if (!params.nieceTerminal) {
00385
00386 std::set<const Word*> terms;
00387 for (size_t i = 0; i < m_source.GetSize(); ++i) {
00388 const Word *word = dynamic_cast<const Word*>(m_source[i]);
00389 if (word) {
00390 terms.insert(word);
00391 }
00392 }
00393
00394
00395 for (size_t i = 0; i < m_source.GetSize(); ++i) {
00396 const NonTerm *nonTerm = dynamic_cast<const NonTerm*>(m_source[i]);
00397 if (nonTerm) {
00398 const ConsistentPhrase &cp = nonTerm->GetConsistentPhrase();
00399 bool containTerm = ContainTerm(cp, terms);
00400
00401 if (containTerm) {
00402
00403
00404
00405
00406 m_isValid = false;
00407 m_canRecurse = false;
00408 return;
00409 }
00410 }
00411 }
00412 }
00413
00414 if (params.maxScope != UNDEFINED || params.minScope > 0) {
00415 int scope = GetScope(params);
00416 if (scope > params.maxScope) {
00417
00418
00419 m_isValid = false;
00420 m_canRecurse = false;
00421 return;
00422 }
00423
00424 if (scope < params.minScope) {
00425
00426
00427 m_isValid = false;
00428 }
00429 }
00430
00431
00432 if (params.scopeSpan.size()) {
00433 size_t scope = GetScope(params);
00434 if (scope >= params.scopeSpan.size()) {
00435
00436 } else {
00437 const std::pair<int,int> &constraint = params.scopeSpan[scope];
00438 int sourceWidth = m_lhs.GetWidth(Moses::Input);
00439 if (sourceWidth < constraint.first || sourceWidth > constraint.second) {
00440 m_isValid = false;
00441 m_canRecurse = false;
00442 return;
00443 }
00444 }
00445 }
00446 }
00447
00448 int Rule::GetScope(const Parameter ¶ms) const
00449 {
00450 size_t scope = 0;
00451 bool previousIsAmbiguous = false;
00452
00453 if (m_source[0]->IsNonTerm()) {
00454 scope++;
00455 previousIsAmbiguous = true;
00456 }
00457
00458 for (size_t i = 1; i < m_source.GetSize(); ++i) {
00459 const RuleSymbol *symbol = m_source[i];
00460 bool isAmbiguous = symbol->IsNonTerm();
00461 if (isAmbiguous) {
00462
00463 const NonTerm *nt = static_cast<const NonTerm*>(symbol);
00464 isAmbiguous = nt->IsHiero(Moses::Input, params);
00465 }
00466
00467 if (isAmbiguous && previousIsAmbiguous) {
00468 scope++;
00469 }
00470 previousIsAmbiguous = isAmbiguous;
00471 }
00472
00473 if (previousIsAmbiguous) {
00474 scope++;
00475 }
00476
00477 return scope;
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494 }
00495
// Returns true when `coll` holds an element that compares equal to `sought`
// by content, i.e. sought->CompareString(*element) == 0. The comparison is on
// the pointed-to objects, not on pointer identity, so the set is scanned
// linearly.
//
// T must provide `int CompareString(const T &) const` (or a compatible
// signature).
template<typename T>
bool Contains(const T *sought, const std::set<const T*> &coll)
{
  typename std::set<const T*>::const_iterator iter;
  for (iter = coll.begin(); iter != coll.end(); ++iter) {
    const T *found = *iter;
    if (sought->CompareString(*found) == 0) {
      return true;
    }
  }
  return false;
}
00508
00509 bool Rule::ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const
00510 {
00511 const Phrase &sourceSentence = m_alignedSentence.GetPhrase(Moses::Input);
00512
00513 for (int pos = cp.corners[0]; pos <= cp.corners[1]; ++pos) {
00514 const Word *soughtWord = sourceSentence[pos];
00515
00516
00517 if (Contains(soughtWord, terms)) {
00518 return true;
00519 }
00520 }
00521 return false;
00522 }
00523
00524 bool CompareTargetNonTerms(const NonTerm *a, const NonTerm *b)
00525 {
00526
00527 return a->GetConsistentPhrase().corners[2] < b->GetConsistentPhrase().corners[2];
00528 }
00529
00530 void Rule::CreateTarget(const Parameter ¶ms)
00531 {
00532 if (!m_isValid) {
00533 return;
00534 }
00535
00536 vector<const NonTerm*> targetNonTerm(m_nonterms);
00537 std::sort(targetNonTerm.begin(), targetNonTerm.end(), CompareTargetNonTerms);
00538
00539 const NonTerm *cp = NULL;
00540 size_t nonTermInd = 0;
00541 if (nonTermInd < targetNonTerm.size()) {
00542 cp = targetNonTerm[nonTermInd];
00543 }
00544
00545 for (int targetPos = m_lhs.GetConsistentPhrase().corners[2];
00546 targetPos <= m_lhs.GetConsistentPhrase().corners[3];
00547 ++targetPos) {
00548
00549 const RuleSymbol *ruleSymbol;
00550 if (cp && cp->GetConsistentPhrase().corners[2] <= targetPos && targetPos <= cp->GetConsistentPhrase().corners[3]) {
00551
00552 ruleSymbol = cp;
00553 targetPos = cp->GetConsistentPhrase().corners[3];
00554 if (targetNonTerm.size()) {
00555 cp = targetNonTerm[nonTermInd];
00556 }
00557
00558
00559 ++nonTermInd;
00560 cp = (nonTermInd < targetNonTerm.size()) ? targetNonTerm[nonTermInd] : NULL;
00561 } else {
00562
00563 ruleSymbol = m_alignedSentence.GetPhrase(Moses::Output)[targetPos];
00564 }
00565
00566 m_target.Add(ruleSymbol);
00567 }
00568
00569 CreateAlignments();
00570 }
00571
00572
00573 void Rule::CreateAlignments()
00574 {
00575 for (size_t sourcePos = 0; sourcePos < m_source.GetSize(); ++sourcePos) {
00576 const RuleSymbol *symbol = m_source[sourcePos];
00577 if (!symbol->IsNonTerm()) {
00578
00579 const Word &sourceWord = static_cast<const Word&>(*symbol);
00580 const std::set<const Word *> &targetWords = sourceWord.GetAlignment();
00581 CreateAlignments(sourcePos, targetWords);
00582 } else {
00583
00584 CreateAlignments(sourcePos, symbol);
00585 }
00586 }
00587 }
00588
00589 void Rule::CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords)
00590 {
00591 std::set<const Word *>::const_iterator iterTarget;
00592 for (iterTarget = targetWords.begin(); iterTarget != targetWords.end(); ++iterTarget) {
00593 const Word *targetWord = *iterTarget;
00594 CreateAlignments(sourcePos, targetWord);
00595 }
00596 }
00597
00598 void Rule::CreateAlignments(int sourcePos, const RuleSymbol *targetSought)
00599 {
00600
00601 for (size_t targetPos = 0; targetPos < m_target.GetSize(); ++targetPos) {
00602 const RuleSymbol *foundSymbol = m_target[targetPos];
00603 if (targetSought == foundSymbol) {
00604 pair<int, int> alignPoint(sourcePos, targetPos);
00605 m_alignments.insert(alignPoint);
00606 return;
00607 }
00608 }
00609
00610 throw "not found";
00611 }
00612
00613 void Rule::CreateProperties(const Parameter ¶ms)
00614 {
00615
00616
00617
00618 if (params.spanLength && m_nonterms.size()) {
00619 stringstream strme;
00620 strme << "{{SpanLength ";
00621
00622 for (size_t i = 0; i < m_nonterms.size(); ++i) {
00623 const NonTerm &nonTerm = *m_nonterms[i];
00624 const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
00625 strme << i << "," << cp.GetWidth(Moses::Input) << "," << cp.GetWidth(Moses::Output) << " ";
00626 }
00627 strme << "}}";
00628
00629 m_properties.push_back(strme.str());
00630 }
00631
00632 if (params.ruleLength && m_nonterms.size()) {
00633 const ConsistentPhrase &cp = m_lhs.GetConsistentPhrase();
00634
00635 stringstream strme;
00636 strme << "{{RuleLength ";
00637 strme << cp.GetWidth(Moses::Input);
00638 strme << "}}";
00639
00640 m_properties.push_back(strme.str());
00641 }
00642
00643
00644 if (params.nonTermContext && m_nonterms.size()) {
00645 stringstream strme;
00646 strme << "{{NonTermContext ";
00647
00648 int factor = params.nonTermContextFactor;
00649
00650 for (size_t i = 0; i < m_nonterms.size(); ++i) {
00651 const NonTerm &nonTerm = *m_nonterms[i];
00652 const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
00653 NonTermContext(1, factor, i, cp, strme);
00654 }
00655 strme << "}}";
00656
00657 m_properties.push_back(strme.str());
00658 }
00659
00660
00661 if (params.nonTermContextTarget && m_nonterms.size()) {
00662 stringstream strme;
00663 strme << "{{NonTermContextTarget ";
00664
00665 int factor = params.nonTermContextFactor;
00666
00667 for (size_t i = 0; i < m_nonterms.size(); ++i) {
00668 const NonTerm &nonTerm = *m_nonterms[i];
00669 const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase();
00670 NonTermContext(2, factor, i, cp, strme);
00671 }
00672 strme << "}}";
00673
00674 m_properties.push_back(strme.str());
00675 }
00676
00677 }
00678
00679