00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <algorithm>
00021 #include <assert.h>
00022 #include <cstdio>
00023 #include <cstring>
00024 #include <fstream>
00025 #include <iostream>
00026 #include <list>
00027 #include <map>
00028 #include <set>
00029 #include <sstream>
00030 #include <string>
00031 #include <vector>
00032 #include <limits>
00033
00034 #ifdef WIN32
00035
00036
00037 #endif
00038
00039 #include "ExtractedRule.h"
00040 #include "Hole.h"
00041 #include "HoleCollection.h"
00042 #include "RuleExist.h"
00043 #include "SentenceAlignmentWithSyntax.h"
00044 #include "SyntaxNode.h"
00045 #include "tables-core.h"
00046 #include "XmlTree.h"
00047 #include "InputFileStream.h"
00048 #include "OutputFileStream.h"
00049 #include "PhraseOrientation.h"
00050
00051 using namespace std;
00052 using namespace MosesTraining;
00053
// One entry per labelled span of a rule: [0] target LHS, [1] source LHS,
// then one entry per hole on the target side, then one per hole on the source side.
typedef std::vector<int> LabelIndex;
// Maps a word position in the sentence to its symbol position inside a rule.
typedef std::map<int, int> WordIndex;
00056
00057 class ExtractTask
00058 {
00059 private:
00060 SentenceAlignmentWithSyntax &m_sentence;
00061 const RuleExtractionOptions &m_options;
00062 Moses::OutputFileStream& m_extractFile;
00063 Moses::OutputFileStream& m_extractFileInv;
00064 Moses::OutputFileStream& m_extractFileContext;
00065 Moses::OutputFileStream& m_extractFileContextInv;
00066 PhraseOrientation m_phraseOrientation;
00067
00068 vector< ExtractedRule > m_extractedRules;
00069
00070
00071 void extractRules();
00072 void addRuleToCollection(ExtractedRule &rule);
00073 void consolidateRules();
00074 void writeRulesToFile();
00075
00076
00077 void addRule( int, int, int, int, int, RuleExist &ruleExist);
00078 void addHieroRule( int startT, int endT, int startS, int endS
00079 , RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
00080 void saveHieroPhrase( int startT, int endT, int startS, int endS
00081 , HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
00082 string saveTargetHieroPhrase( int startT, int endT, int startS, int endS
00083 , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
00084 string saveSourceHieroPhrase( int startT, int endT, int startS, int endS
00085 , HoleCollection &holeColl, const LabelIndex &labelIndex);
00086 void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
00087 , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
00088 void saveHieroAlignment( int startT, int endT, int startS, int endS
00089 , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
00090 void saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule);
00091 void saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
00092
00093 inline string IntToString( int i ) {
00094 stringstream out;
00095 out << i;
00096 return out.str();
00097 }
00098
00099 public:
00100 ExtractTask(SentenceAlignmentWithSyntax &sentence, const RuleExtractionOptions &options, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv, Moses::OutputFileStream &extractFileContext, Moses::OutputFileStream &extractFileContextInv):
00101 m_sentence(sentence),
00102 m_options(options),
00103 m_extractFile(extractFile),
00104 m_extractFileInv(extractFileInv),
00105 m_extractFileContext(extractFileContext),
00106 m_extractFileContextInv(extractFileContextInv) {}
00107 void Run();
00108
00109 };
00110
00111
00112 void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence );
00113 void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
00114 void writeUnknownWordLabel(const string &);
00115 void writePhraseOrientationPriors(const string &);
00116
00117 double getPcfgScore(const SyntaxNode &);
00118
00119
00120 int main(int argc, char* argv[])
00121 {
00122 cerr << "extract-rules, written by Philipp Koehn\n"
00123 << "rule extraction from an aligned parallel corpus\n";
00124
00125 RuleExtractionOptions options;
00126 int sentenceOffset = 0;
00127 #ifdef WITH_THREADS
00128 int thread_count = 1;
00129 #endif
00130 if (argc < 5) {
00131 cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
00132
00133 << " --GlueGrammar FILE"
00134 << " | --UnknownWordLabel FILE"
00135 << " | --OnlyDirect"
00136 << " | --MaxSpan[" << options.maxSpan << "]"
00137 << " | --MinHoleTarget[" << options.minHoleTarget << "]"
00138 << " | --MinHoleSource[" << options.minHoleSource << "]"
00139 << " | --MinWords[" << options.minWords << "]"
00140 << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
00141 << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
00142 << " | --MaxNonTerm[" << options.maxNonTerm << "]"
00143 << " | --MaxScope[" << options.maxScope << "]"
00144 << " | --SourceSyntax | --TargetSyntax"
00145 << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting"
00146 << " | --UnpairedExtractFormat"
00147 << " | --ConditionOnTargetLHS ]"
00148 << " | --BoundaryRules[" << options.boundaryRules << "]"
00149 << " | --FlexibilityScore"
00150 << " | --PhraseOrientation\n";
00151
00152 exit(1);
00153 }
00154 char* &fileNameT = argv[1];
00155 char* &fileNameS = argv[2];
00156 char* &fileNameA = argv[3];
00157 string fileNameGlueGrammar;
00158 string fileNameUnknownWordLabel;
00159 string fileNameExtract = string(argv[4]);
00160
00161 int optionInd = 5;
00162
00163 for(int i=optionInd; i<argc; i++) {
00164
00165 if (strcmp(argv[i],"--MaxSpan") == 0) {
00166 options.maxSpan = atoi(argv[++i]);
00167 if (options.maxSpan < 1) {
00168 cerr << "extract error: --maxSpan should be at least 1" << endl;
00169 exit(1);
00170 }
00171 } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
00172 options.minHoleTarget = atoi(argv[++i]);
00173 if (options.minHoleTarget < 1) {
00174 cerr << "extract error: --minHoleTarget should be at least 1" << endl;
00175 exit(1);
00176 }
00177 } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
00178 options.minHoleSource = atoi(argv[++i]);
00179 if (options.minHoleSource < 1) {
00180 cerr << "extract error: --minHoleSource should be at least 1" << endl;
00181 exit(1);
00182 }
00183 }
00184
00185 else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
00186 options.maxSymbolsTarget = atoi(argv[++i]);
00187 if (options.maxSymbolsTarget < 1) {
00188 cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
00189 exit(1);
00190 }
00191 }
00192
00193 else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
00194 options.maxSymbolsSource = atoi(argv[++i]);
00195 if (options.maxSymbolsSource < 1) {
00196 cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
00197 exit(1);
00198 }
00199 }
00200
00201 else if (strcmp(argv[i],"--MinWords") == 0) {
00202 options.minWords = atoi(argv[++i]);
00203 if (options.minWords < 0) {
00204 cerr << "extract error: --MinWords should be at least 0" << endl;
00205 exit(1);
00206 }
00207 }
00208
00209 else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
00210 options.maxNonTerm = atoi(argv[++i]);
00211 if (options.maxNonTerm < 1) {
00212 cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
00213 exit(1);
00214 }
00215 }
00216
00217 else if (strcmp(argv[i],"--MaxScope") == 0) {
00218 options.maxScope = atoi(argv[++i]);
00219 if (options.maxScope < 0) {
00220 cerr << "extract error: --MaxScope should be at least 0" << endl;
00221 exit(1);
00222 }
00223 } else if (strcmp(argv[i], "--GZOutput") == 0) {
00224 options.gzOutput = true;
00225 }
00226
00227 else if (strcmp(argv[i],"--TargetSyntax") == 0) {
00228 options.targetSyntax = true;
00229 } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
00230 options.targetSyntacticPreferences = true;
00231 } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
00232 options.sourceSyntax = true;
00233 } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
00234 options.requireAlignedWord = false;
00235 } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
00236 options.nonTermConsecTarget = false;
00237 } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
00238 options.nonTermConsecSource = true;
00239 } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
00240 options.nonTermFirstWord = false;
00241 } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
00242 options.onlyOutputSpanInfo = true;
00243 } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
00244 options.onlyDirectFlag = true;
00245 } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
00246 options.glueGrammarFlag = true;
00247 if (++i >= argc) {
00248 cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
00249 exit(0);
00250 }
00251 fileNameGlueGrammar = string(argv[i]);
00252 cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
00253 } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
00254 options.unknownWordLabelFlag = true;
00255 if (++i >= argc) {
00256 cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
00257 exit(0);
00258 }
00259 fileNameUnknownWordLabel = string(argv[i]);
00260 cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
00261 }
00262
00263
00264
00265
00266
00267 else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
00268 options.fractionalCounting = false;
00269 } else if (strcmp(argv[i],"--PCFG") == 0) {
00270 options.pcfgScore = true;
00271 } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
00272 options.unpairedExtractFormat = true;
00273 } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
00274 options.conditionOnTargetLhs = true;
00275 } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
00276 options.flexScoreFlag = true;
00277 } else if (strcmp(argv[i],"--PhraseOrientation") == 0) {
00278 options.phraseOrientation = true;
00279 } else if (strcmp(argv[i],"-threads") == 0 ||
00280 strcmp(argv[i],"--threads") == 0 ||
00281 strcmp(argv[i],"--Threads") == 0) {
00282 #ifdef WITH_THREADS
00283 thread_count = atoi(argv[++i]);
00284 #else
00285 cerr << "thread support not compiled in." << '\n';
00286 exit(1);
00287 #endif
00288 } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
00289 if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
00290 cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
00291 exit(1);
00292 }
00293 sentenceOffset = atoi(argv[++i]);
00294 } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
00295 options.boundaryRules = true;
00296 } else {
00297 cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
00298 exit(1);
00299 }
00300 }
00301
00302 cerr << "extracting hierarchical rules" << endl;
00303
00304
00305 Moses::InputFileStream tFile(fileNameT);
00306 Moses::InputFileStream sFile(fileNameS);
00307 Moses::InputFileStream aFile(fileNameA);
00308
00309 istream *tFileP = &tFile;
00310 istream *sFileP = &sFile;
00311 istream *aFileP = &aFile;
00312
00313
00314 string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
00315 Moses::OutputFileStream extractFile;
00316 Moses::OutputFileStream extractFileInv;
00317 Moses::OutputFileStream extractFileContext;
00318 Moses::OutputFileStream extractFileContextInv;
00319 extractFile.Open((fileNameExtract + (options.gzOutput?".gz":"")).c_str());
00320 if (!options.onlyDirectFlag)
00321 extractFileInv.Open(fileNameExtractInv.c_str());
00322
00323 if (options.flexScoreFlag) {
00324 string fileNameExtractContext = fileNameExtract + ".context" + (options.gzOutput?".gz":"");
00325 extractFileContext.Open(fileNameExtractContext.c_str());
00326 if (!options.onlyDirectFlag) {
00327 string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.gzOutput?".gz":"");
00328 extractFileContextInv.Open(fileNameExtractContextInv.c_str());
00329 }
00330 }
00331
00332
00333 set< string > targetLabelCollection, sourceLabelCollection;
00334 map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
00335
00336
00337 size_t i=sentenceOffset;
00338 string targetString, sourceString, alignmentString;
00339
00340 while(getline(*tFileP, targetString)) {
00341 i++;
00342
00343 getline(*sFileP, sourceString);
00344 getline(*aFileP, alignmentString);
00345
00346 if (i%1000 == 0) cerr << i << " " << flush;
00347
00348 SentenceAlignmentWithSyntax sentence
00349 (targetLabelCollection, sourceLabelCollection,
00350 targetTopLabelCollection, sourceTopLabelCollection,
00351 options.targetSyntax, options.sourceSyntax);
00352
00353 if (options.onlyOutputSpanInfo) {
00354 cout << "LOG: SRC: " << sourceString << endl;
00355 cout << "LOG: TGT: " << targetString << endl;
00356 cout << "LOG: ALT: " << alignmentString << endl;
00357 cout << "LOG: PHRASES_BEGIN:" << endl;
00358 }
00359
00360 if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
00361 if (options.unknownWordLabelFlag) {
00362 collectWordLabelCounts(sentence);
00363 }
00364 ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv, extractFileContext, extractFileContextInv);
00365 task->Run();
00366 delete task;
00367 }
00368 if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl;
00369 }
00370
00371 tFile.Close();
00372 sFile.Close();
00373 aFile.Close();
00374
00375 if (!options.onlyOutputSpanInfo) {
00376 extractFile.Close();
00377 if (!options.onlyDirectFlag) extractFileInv.Close();
00378 }
00379
00380 if (options.flexScoreFlag) {
00381 extractFileContext.Close();
00382 if (!options.onlyDirectFlag) extractFileContextInv.Close();
00383 }
00384
00385 if (options.glueGrammarFlag)
00386 writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);
00387
00388 if (options.unknownWordLabelFlag)
00389 writeUnknownWordLabel(fileNameUnknownWordLabel);
00390
00391 if (options.phraseOrientation) {
00392 std::string fileNamePhraseOrientationPriors = fileNameExtract + string(".phraseOrientationPriors");
00393 writePhraseOrientationPriors(fileNamePhraseOrientationPriors);
00394 }
00395 }
00396
00397 void ExtractTask::Run()
00398 {
00399 extractRules();
00400 consolidateRules();
00401 writeRulesToFile();
00402 m_extractedRules.clear();
00403 }
00404
00405 void ExtractTask::extractRules()
00406 {
00407 int countT = m_sentence.target.size();
00408 int countS = m_sentence.source.size();
00409
00410
00411 if (m_options.phraseOrientation) {
00412 m_sentence.invertAlignment();
00413 m_phraseOrientation = PhraseOrientation(countS, countT, m_sentence.alignedToT, m_sentence.alignedToS, m_sentence.alignedCountS);
00414 }
00415
00416
00417 RuleExist ruleExist(countT);
00418
00419
00420 for(int lengthT=1;
00421 lengthT <= m_options.maxSpan && lengthT <= countT;
00422 lengthT++) {
00423 for(int startT=0; startT < countT-(lengthT-1); startT++) {
00424
00425
00426 int endT = startT + lengthT - 1;
00427
00428
00429 if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT))
00430
00431 continue;
00432
00433
00434
00435 int minS = std::numeric_limits<int>::max();
00436 int maxS = -1;
00437 vector< int > usedS = m_sentence.alignedCountS;
00438 for(int ti=startT; ti<=endT; ti++) {
00439 for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
00440 int si = m_sentence.alignedToT[ti][i];
00441 if (si<minS) {
00442 minS = si;
00443 }
00444 if (si>maxS) {
00445 maxS = si;
00446 }
00447 usedS[ si ]--;
00448 }
00449 }
00450
00451
00452 if( maxS == -1 )
00453 continue;
00454
00455
00456 if( maxS-minS >= m_options.maxSpan )
00457 continue;
00458
00459
00460 bool out_of_bounds = false;
00461 for(int si=minS; si<=maxS && !out_of_bounds; si++)
00462 if (usedS[si]>0) {
00463 out_of_bounds = true;
00464 }
00465
00466
00467 if (out_of_bounds)
00468 continue;
00469
00470
00471
00472 for(int startS=minS;
00473 (startS>=0 &&
00474 startS>maxS - m_options.maxSpan &&
00475 (startS==minS || m_sentence.alignedCountS[startS]==0));
00476 startS--) {
00477
00478 for(int endS=maxS;
00479 (endS<countS && endS<startS + m_options.maxSpan &&
00480 (endS==maxS || m_sentence.alignedCountS[endS]==0));
00481 endS++) {
00482
00483 if (m_options.sourceSyntax && !m_sentence.sourceTree.HasNode(startS,endS))
00484 continue;
00485
00486
00487
00488
00489 if (endT-startT < m_options.maxSymbolsTarget && endS-startS < m_options.maxSymbolsSource) {
00490 addRule(startT,endT,startS,endS, countS, ruleExist);
00491 }
00492
00493
00494 ruleExist.Add(startT, endT, startS, endS);
00495
00496
00497
00498
00499 int initStartT = m_options.nonTermFirstWord ? startT : startT + 1;
00500
00501 HoleCollection holeColl(startS, endS);
00502 addHieroRule(startT, endT, startS, endS,
00503 ruleExist, holeColl, 0, initStartT,
00504 endT-startT+1, endS-startS+1);
00505 }
00506 }
00507 }
00508 }
00509 }
00510
00511 void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
00512 , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex)
00513 {
00514 vector<Hole*>::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
00515 assert(iterHoleList != holeColl.GetSortedSourceHoles().end());
00516
00517 int outPos = 0;
00518 int holeCount = 0;
00519 int holeTotal = holeColl.GetHoles().size();
00520 for(int currPos = startS; currPos <= endS; currPos++) {
00521 bool isHole = false;
00522 if (iterHoleList != holeColl.GetSortedSourceHoles().end()) {
00523 const Hole &hole = **iterHoleList;
00524 isHole = hole.GetStart(0) == currPos;
00525 }
00526
00527 if (isHole) {
00528 Hole &hole = **iterHoleList;
00529
00530 int labelI = labelIndex[ 2+holeCount+holeTotal ];
00531 string label = m_options.sourceSyntax ?
00532 m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X";
00533 hole.SetLabel(label, 0);
00534
00535 currPos = hole.GetEnd(0);
00536 hole.SetPos(outPos, 0);
00537 ++iterHoleList;
00538 ++holeCount;
00539 } else {
00540 indexS[currPos] = outPos;
00541 }
00542
00543 outPos++;
00544 }
00545
00546 assert(iterHoleList == holeColl.GetSortedSourceHoles().end());
00547 }
00548
00549 string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int endS
00550 , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
00551 , int countS)
00552 {
00553 HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
00554 assert(iterHoleList != holeColl.GetHoles().end());
00555
00556 string out = "";
00557 int outPos = 0;
00558 int holeCount = 0;
00559 for(int currPos = startT; currPos <= endT; currPos++) {
00560 bool isHole = false;
00561 if (iterHoleList != holeColl.GetHoles().end()) {
00562 const Hole &hole = *iterHoleList;
00563 isHole = hole.GetStart(1) == currPos;
00564 }
00565
00566 if (isHole) {
00567 Hole &hole = *iterHoleList;
00568
00569 const string &sourceLabel = hole.GetLabel(0);
00570 assert(sourceLabel != "");
00571
00572 int labelI = labelIndex[ 2+holeCount ];
00573 string targetLabel;
00574 if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
00575 targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
00576 } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
00577 targetLabel = "S";
00578 } else {
00579 targetLabel = "X";
00580 }
00581
00582 hole.SetLabel(targetLabel, 1);
00583
00584 if (m_options.unpairedExtractFormat) {
00585 out += "[" + targetLabel + "] ";
00586 } else {
00587 out += "[" + sourceLabel + "][" + targetLabel + "] ";
00588 }
00589
00590 if (m_options.pcfgScore) {
00591 logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]);
00592 }
00593
00594 currPos = hole.GetEnd(1);
00595 hole.SetPos(outPos, 1);
00596 ++iterHoleList;
00597 holeCount++;
00598 } else {
00599 indexT[currPos] = outPos;
00600 out += m_sentence.target[currPos] + " ";
00601 }
00602
00603 outPos++;
00604 }
00605
00606 assert(iterHoleList == holeColl.GetHoles().end());
00607 return out.erase(out.size()-1);
00608 }
00609
00610 string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int endS
00611 , HoleCollection &holeColl, const LabelIndex &labelIndex)
00612 {
00613 vector<Hole*>::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
00614 assert(iterHoleList != holeColl.GetSortedSourceHoles().end());
00615
00616 string out = "";
00617 int outPos = 0;
00618 int holeCount = 0;
00619 for(int currPos = startS; currPos <= endS; currPos++) {
00620 bool isHole = false;
00621 if (iterHoleList != holeColl.GetSortedSourceHoles().end()) {
00622 const Hole &hole = **iterHoleList;
00623 isHole = hole.GetStart(0) == currPos;
00624 }
00625
00626 if (isHole) {
00627 Hole &hole = **iterHoleList;
00628
00629 const string &targetLabel = hole.GetLabel(1);
00630 assert(targetLabel != "");
00631
00632 const string &sourceLabel = hole.GetLabel(0);
00633 if (m_options.unpairedExtractFormat) {
00634 out += "[" + sourceLabel + "] ";
00635 } else {
00636 out += "[" + sourceLabel + "][" + (m_options.targetSyntacticPreferences ? "X" : targetLabel) + "] ";
00637 }
00638
00639 currPos = hole.GetEnd(0);
00640 hole.SetPos(outPos, 0);
00641 ++iterHoleList;
00642 ++holeCount;
00643 } else {
00644 out += m_sentence.source[currPos] + " ";
00645 }
00646
00647 outPos++;
00648 }
00649
00650 assert(iterHoleList == holeColl.GetSortedSourceHoles().end());
00651 return out.erase(out.size()-1);
00652 }
00653
00654 void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
00655 , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule)
00656 {
00657
00658 for(int ti=startT; ti<=endT; ti++) {
00659 WordIndex::const_iterator p = indexT.find(ti);
00660 if (p != indexT.end()) {
00661 for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
00662 int si = m_sentence.alignedToT[ti][i];
00663 std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
00664 std::string targetSymbolIndex = IntToString(p->second);
00665 rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
00666 if (! m_options.onlyDirectFlag)
00667 rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
00668 }
00669 }
00670 }
00671
00672
00673 HoleList::const_iterator iterHole;
00674 for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
00675 const Hole &hole = *iterHole;
00676
00677 std::string sourceSymbolIndex = IntToString(hole.GetPos(0));
00678 std::string targetSymbolIndex = IntToString(hole.GetPos(1));
00679 rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
00680 if (!m_options.onlyDirectFlag)
00681 rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
00682 }
00683
00684 rule.alignment.erase(rule.alignment.size()-1);
00685 if (!m_options.onlyDirectFlag) {
00686 rule.alignmentInv.erase(rule.alignmentInv.size()-1);
00687 }
00688 }
00689
00690 void ExtractTask::saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule)
00691 {
00692 rule.targetSyntacticPreference = "";
00693 int holeCount = 0;
00694 for (HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
00695 iterHoleList != holeColl.GetHoles().end();
00696 ++iterHoleList) {
00697
00698 const Hole &hole = *iterHoleList;
00699
00700 int labelI = labelIndex[ 2+holeCount ];
00701 int startT = hole.GetStart(1);
00702 int endT = hole.GetEnd(1);
00703 if (m_sentence.targetTree.HasNode(startT,endT)) {
00704 rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelI]->label;
00705 rule.targetSyntacticPreference += " ";
00706 } else {
00707 rule.targetSyntacticPreference += "XRHS ";
00708 }
00709 ++holeCount;
00710 }
00711
00712 rule.targetSyntacticPreference.erase(rule.targetSyntacticPreference.size()-1);
00713 }
00714
00715
00716 void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
00717 , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
00718 {
00719 WordIndex indexS, indexT;
00720
00721 ExtractedRule rule( startT, endT, startS, endS );
00722
00723
00724 string targetLabel;
00725
00726 if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
00727 targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
00728 } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
00729 targetLabel = "S";
00730 } else {
00731 targetLabel = "X";
00732 }
00733
00734 string sourceLabel = m_options.sourceSyntax ?
00735 m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X";
00736
00737
00738 preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
00739
00740
00741 if (m_options.pcfgScore) {
00742 double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]);
00743 rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
00744 + " [" + targetLabel + "]";
00745 rule.pcfgScore = std::exp(logPCFGScore);
00746 } else {
00747 double logPCFGScore = 0.0f;
00748 rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
00749 + " [" + targetLabel + "]";
00750 }
00751
00752
00753 rule.source = saveSourceHieroPhrase(startT, endT, startS, endS, holeColl, labelIndex);
00754 if (m_options.conditionOnTargetLhs) {
00755 rule.source += " [" + targetLabel + "]";
00756 } else {
00757 rule.source += " [" + sourceLabel + "]";
00758 }
00759
00760
00761 saveHieroAlignment(startT, endT, startS, endS, indexS, indexT, holeColl, rule);
00762
00763
00764 if (m_options.flexScoreFlag) {
00765 rule.sourceContextLeft = startS == 0 ? "<s>" : m_sentence.source[startS-1];
00766 rule.sourceContextRight = endS+1 == m_sentence.source.size() ? "<s>" : m_sentence.source[endS+1];
00767 rule.targetContextLeft = startT == 0 ? "<s>" : m_sentence.target[startT-1];
00768 rule.targetContextRight = endT+1 == m_sentence.target.size() ? "<s>" : m_sentence.target[endT+1];
00769 rule.sourceHoleString = "";
00770 rule.targetHoleString = "";
00771
00772 HoleList::const_iterator iterHole;
00773 for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
00774 const Hole &hole = *iterHole;
00775 rule.sourceHoleString += hole.GetLabel(0) + ": ";
00776
00777
00778 if (hole.GetStart(0) == startS) {
00779 rule.sourceContextLeft = m_sentence.source[hole.GetEnd(0)];
00780 }
00781
00782 else if (hole.GetEnd(0) == endS) {
00783 rule.sourceContextRight = m_sentence.source[hole.GetStart(0)];
00784 }
00785
00786 if (hole.GetStart(1) == startT) {
00787 rule.targetContextLeft = m_sentence.target[hole.GetEnd(1)];
00788 } else if (hole.GetEnd(1) == endT) {
00789 rule.targetContextRight = m_sentence.target[hole.GetStart(1)];
00790 }
00791
00792 for (int i = hole.GetStart(0); i <= hole.GetEnd(0); ++i) {
00793 rule.sourceHoleString += m_sentence.source[i] + " ";
00794 }
00795 rule.targetHoleString += hole.GetLabel(1) + ": ";
00796 for (int i = hole.GetStart(1); i <= hole.GetEnd(1); ++i) {
00797 rule.targetHoleString += m_sentence.target[i] + " ";
00798 }
00799 }
00800 }
00801
00802
00803 if (m_options.phraseOrientation) {
00804 rule.l2rOrientation = m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_L2R);
00805 rule.r2lOrientation = m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L);
00806
00807
00808
00809 }
00810
00811
00812 if (m_options.targetSyntacticPreferences) {
00813 saveTargetSyntacticPreference(holeColl, labelIndex, rule);
00814 if (m_sentence.targetTree.HasNode(startT,endT)) {
00815 rule.targetSyntacticPreference += " ";
00816 rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
00817 } else {
00818 rule.targetSyntacticPreference += " XLHS";
00819 }
00820 }
00821
00822 addRuleToCollection( rule );
00823 }
00824
00825 void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS)
00826 {
00827 LabelIndex labelIndex,labelCount;
00828
00829
00830 int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
00831 if (m_options.targetSyntacticPreferences && !numLabels) {
00832 numLabels++;
00833 }
00834 labelCount.push_back(numLabels);
00835 labelIndex.push_back(0);
00836
00837
00838 numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1;
00839 labelCount.push_back(numLabels);
00840 labelIndex.push_back(0);
00841
00842
00843 for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
00844 hole != holeColl.GetHoles().end(); hole++ ) {
00845 int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
00846 if (m_options.targetSyntacticPreferences && !numLabels) {
00847 numLabels++;
00848 }
00849 labelCount.push_back(numLabels);
00850 labelIndex.push_back(0);
00851 }
00852
00853
00854 holeColl.SortSourceHoles();
00855 for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
00856 i != holeColl.GetSortedSourceHoles().end(); i++ ) {
00857 const Hole &hole = **i;
00858 int numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
00859 labelCount.push_back(numLabels);
00860 labelIndex.push_back(0);
00861 }
00862
00863
00864 bool done = false;
00865 while(!done) {
00866 saveHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS );
00867 for(unsigned int i=0; i<labelIndex.size(); i++) {
00868 labelIndex[i]++;
00869 if(labelIndex[i] == labelCount[i]) {
00870 labelIndex[i] = 0;
00871 if (i == labelIndex.size()-1)
00872 done = true;
00873 } else {
00874 break;
00875 }
00876 }
00877 }
00878 }
00879
00880
00881
00882 void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
00883 , RuleExist &ruleExist, HoleCollection &holeColl
00884 , int numHoles, int initStartT, int wordCountT, int wordCountS)
00885 {
00886
00887 if (numHoles >= m_options.maxNonTerm)
00888 return;
00889
00890
00891 for (int startHoleT = initStartT; startHoleT <= endT; ++startHoleT) {
00892 for (int endHoleT = startHoleT+(m_options.minHoleTarget-1); endHoleT <= endT; ++endHoleT) {
00893
00894 if (numHoles == m_options.maxNonTerm-1 && wordCountT - (endHoleT-startT+1) + (numHoles+1) > m_options.maxSymbolsTarget)
00895 continue;
00896
00897
00898 const int newWordCountT = wordCountT - (endHoleT-startHoleT+1);
00899
00900
00901 if (newWordCountT < m_options.minWords)
00902 continue;
00903
00904
00905 if (startHoleT == startT && endHoleT == endT)
00906 continue;
00907
00908
00909
00910
00911 const HoleList &sourceHoles = ruleExist.GetSourceHoles(startHoleT, endHoleT);
00912
00913
00914 HoleList::const_iterator iterSourceHoles;
00915 for (iterSourceHoles = sourceHoles.begin(); iterSourceHoles != sourceHoles.end(); ++iterSourceHoles) {
00916 const Hole &sourceHole = *iterSourceHoles;
00917
00918 const int sourceHoleSize = sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1;
00919
00920
00921 if (sourceHoleSize < m_options.minHoleSource)
00922 continue;
00923
00924
00925 const int newWordCountS = wordCountS - sourceHoleSize;
00926
00927
00928 if (numHoles == m_options.maxNonTerm-1 && newWordCountS + (numHoles+1) > m_options.maxSymbolsSource)
00929 continue;
00930
00931
00932 if (newWordCountS < m_options.minWords)
00933 continue;
00934
00935
00936
00937 if (startS > sourceHole.GetStart(0) || endS < sourceHole.GetEnd(0))
00938 continue;
00939
00940
00941 if (holeColl.OverlapSource(sourceHole))
00942 continue;
00943
00944
00945 if (!m_options.nonTermConsecSource && holeColl.ConsecSource(sourceHole) )
00946 continue;
00947
00948
00949
00950 if (holeColl.Scope(sourceHole) > m_options.maxScope)
00951 continue;
00952
00953
00954 if (m_options.requireAlignedWord && (newWordCountS > 0 || newWordCountT > 0)) {
00955 HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
00956 bool foundAlignedWord = false;
00957
00958 for(int pos = startT; pos <= endT && !foundAlignedWord; pos++) {
00959
00960 if (pos == startHoleT) {
00961 pos = endHoleT;
00962 }
00963
00964 else if (iterHoleList != holeColl.GetHoles().end() && iterHoleList->GetStart(1) == pos) {
00965 pos = iterHoleList->GetEnd(1);
00966 ++iterHoleList;
00967 }
00968
00969 else {
00970 if (m_sentence.alignedToT[pos].size() > 0)
00971 foundAlignedWord = true;
00972 }
00973 }
00974 if (!foundAlignedWord)
00975 continue;
00976 }
00977
00978
00979 holeColl.Add(startHoleT, endHoleT, sourceHole.GetStart(0), sourceHole.GetEnd(0));
00980
00981 bool allowablePhrase = true;
00982
00983
00984 if (newWordCountS + (numHoles+1) > m_options.maxSymbolsSource)
00985 allowablePhrase = false;
00986
00987 if (newWordCountT + (numHoles+1) > m_options.maxSymbolsTarget)
00988 allowablePhrase = false;
00989
00990
00991 if (allowablePhrase)
00992 saveAllHieroPhrases(startT, endT, startS, endS, holeColl, wordCountS);
00993
00994
00995 int nextInitStartT = m_options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
00996 addHieroRule(startT, endT, startS, endS
00997 , ruleExist, holeColl, numHoles + 1, nextInitStartT
00998 , newWordCountT, newWordCountS);
00999
01000 holeColl.RemoveLast();
01001 }
01002 }
01003 }
01004 }
01005
01006 void ExtractTask::addRule( int startT, int endT, int startS, int endS, int countS, RuleExist &ruleExist)
01007 {
01008
01009 if (m_options.boundaryRules
01010 && ( (startS == 0 && endS == 0)
01011 || (startS == countS-1 && endS == countS-1))) {
01012 return;
01013 }
01014
01015 if (m_options.onlyOutputSpanInfo) {
01016 cout << startS << " " << endS << " " << startT << " " << endT << endl;
01017 return;
01018 }
01019
01020 ExtractedRule rule(startT, endT, startS, endS);
01021
01022
01023 string targetLabel,sourceLabel;
01024 if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
01025 if (m_sentence.targetTree.HasNode(startT,endT) && !m_options.targetSyntacticPreferences) {
01026 sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
01027 } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
01028 sourceLabel = "S";
01029 } else {
01030 sourceLabel = "X";
01031 }
01032 } else {
01033 sourceLabel = m_options.sourceSyntax ?
01034 m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
01035
01036 if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
01037
01038 targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
01039 } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
01040 targetLabel = "S";
01041 } else {
01042 targetLabel = "X";
01043 }
01044 }
01045
01046
01047 rule.source = "";
01048 for(int si=startS; si<=endS; si++)
01049 rule.source += m_sentence.source[si] + " ";
01050 rule.source += "[" + sourceLabel + "]";
01051
01052
01053 rule.target = "";
01054 for(int ti=startT; ti<=endT; ti++)
01055 rule.target += m_sentence.target[ti] + " ";
01056 rule.target += "[" + targetLabel + "]";
01057
01058 if (m_options.pcfgScore) {
01059 double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]);
01060 rule.pcfgScore = std::exp(logPCFGScore);
01061 }
01062
01063
01064 for(int ti=startT; ti<=endT; ti++) {
01065 for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
01066 int si = m_sentence.alignedToT[ti][i];
01067 std::string sourceSymbolIndex = IntToString(si-startS);
01068 std::string targetSymbolIndex = IntToString(ti-startT);
01069 rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
01070 if (!m_options.onlyDirectFlag)
01071 rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
01072 }
01073 }
01074
01075 rule.alignment.erase(rule.alignment.size()-1);
01076 if (!m_options.onlyDirectFlag)
01077 rule.alignmentInv.erase(rule.alignmentInv.size()-1);
01078
01079
01080 if (m_options.flexScoreFlag) {
01081 rule.sourceContextLeft = startS == 0 ? "<s>" : m_sentence.source[startS-1];
01082 rule.sourceContextRight = endS+1 == m_sentence.source.size() ? "<s>" : m_sentence.source[endS+1];
01083 rule.targetContextLeft = startT == 0 ? "<s>" : m_sentence.target[startT-1];
01084 rule.targetContextRight = endT+1 == m_sentence.target.size() ? "<s>" : m_sentence.target[endT+1];
01085 }
01086
01087
01088 if (m_options.phraseOrientation) {
01089 rule.l2rOrientation = m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_L2R);
01090 rule.r2lOrientation = m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L);
01091
01092
01093
01094 }
01095
01096
01097 if (m_options.targetSyntacticPreferences) {
01098 if (m_sentence.targetTree.HasNode(startT,endT)) {
01099 rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
01100 } else {
01101 rule.targetSyntacticPreference += "XLHS";
01102 }
01103 }
01104
01105 addRuleToCollection( rule );
01106 }
01107
01108 void ExtractTask::addRuleToCollection( ExtractedRule &newRule )
01109 {
01110
01111
01112 if (!m_options.duplicateRules) {
01113 vector<ExtractedRule>::const_iterator rule;
01114 for(rule = m_extractedRules.begin(); rule != m_extractedRules.end(); rule++ ) {
01115 if (rule->source.compare( newRule.source ) == 0 &&
01116 rule->target.compare( newRule.target ) == 0 &&
01117 !(rule->endT < newRule.startT || rule->startT > newRule.endT)) {
01118 return;
01119 }
01120 }
01121 }
01122 m_extractedRules.push_back( newRule );
01123 }
01124
01125 void ExtractTask::consolidateRules()
01126 {
01127 typedef vector<ExtractedRule>::iterator R;
01128 map<int, map<int, map<int, map<int,int> > > > spanCount;
01129
01130
01131 if (m_options.fractionalCounting) {
01132 for(R rule = m_extractedRules.begin(); rule != m_extractedRules.end(); rule++ ) {
01133 spanCount[ rule->startT ][ rule->endT ][ rule->startS ][ rule->endS ]++;
01134 }
01135 }
01136
01137
01138 for(R rule = m_extractedRules.begin(); rule != m_extractedRules.end(); rule++ ) {
01139 rule->count = 1.0/(float) (m_options.fractionalCounting ? spanCount[ rule->startT ][ rule->endT ][ rule->startS ][ rule->endS ] : 1.0 );
01140 }
01141
01142
01143 map<std::string, map< std::string, map< std::string, float> > > consolidatedCount;
01144 for(R rule = m_extractedRules.begin(); rule != m_extractedRules.end(); rule++ ) {
01145 consolidatedCount[ rule->source ][ rule->target][ rule->alignment ] += rule->count;
01146 }
01147
01148 for(R rule = m_extractedRules.begin(); rule != m_extractedRules.end(); rule++ ) {
01149 float count = consolidatedCount[ rule->source ][ rule->target][ rule->alignment ];
01150 rule->count = count;
01151 consolidatedCount[ rule->source ][ rule->target][ rule->alignment ] = 0;
01152 }
01153 }
01154
01155 void ExtractTask::writeRulesToFile()
01156 {
01157 vector<ExtractedRule>::const_iterator rule;
01158 ostringstream out;
01159 ostringstream outInv;
01160 ostringstream outContext;
01161 ostringstream outContextInv;
01162 for(rule = m_extractedRules.begin(); rule != m_extractedRules.end(); rule++ ) {
01163 if (rule->count == 0)
01164 continue;
01165
01166 out << rule->source << " ||| "
01167 << rule->target << " ||| "
01168 << rule->alignment << " ||| "
01169 << rule->count << " ||| ";
01170 if (m_options.pcfgScore) {
01171 out << " ||| " << rule->pcfgScore;
01172 }
01173 if (m_options.phraseOrientation) {
01174 out << " {{Orientation ";
01175 m_phraseOrientation.WriteOrientation(out,rule->l2rOrientation);
01176 out << " ";
01177 m_phraseOrientation.WriteOrientation(out,rule->r2lOrientation);
01178 m_phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_L2R,rule->l2rOrientation,1);
01179 m_phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,rule->r2lOrientation,1);
01180 out << "}}";
01181 }
01182 if (m_options.targetSyntacticPreferences) {
01183 out << " {{TargetPreferences ";
01184 out << rule->targetSyntacticPreference;
01185 out << "}}";
01186 }
01187 out << "\n";
01188
01189 if (!m_options.onlyDirectFlag) {
01190 outInv << rule->target << " ||| "
01191 << rule->source << " ||| "
01192 << rule->alignmentInv << " ||| "
01193 << rule->count << "\n";
01194 }
01195
01196 if (m_options.flexScoreFlag) {
01197 for(int iContext=0; iContext<2; iContext++) {
01198 outContext << rule->source << " ||| "
01199 << rule->target << " ||| "
01200 << rule->alignment << " ||| ";
01201 iContext ? outContext << "< " << rule->sourceContextLeft << "\n" : outContext << "> " << rule->sourceContextRight << "\n";
01202
01203 if (!m_options.onlyDirectFlag) {
01204 outContextInv << rule->target << " ||| "
01205 << rule->source << " ||| "
01206 << rule->alignmentInv << " ||| ";
01207 iContext ? outContextInv << "< " << rule->targetContextLeft << "\n" : outContextInv << "> " << rule->targetContextRight << "\n";
01208 }
01209 }
01210
01211 if (rule->sourceHoleString != "") {
01212 outContext << rule->source << " ||| "
01213 << rule->target << " ||| "
01214 << rule->alignment << " ||| v "
01215 << rule->sourceHoleString << "\n";
01216 }
01217
01218 if (!m_options.onlyDirectFlag and rule->targetHoleString != "") {
01219 outContextInv << rule->target << " ||| "
01220 << rule->source << " ||| "
01221 << rule->alignmentInv << " ||| v "
01222 << rule->targetHoleString << "\n";
01223 }
01224 }
01225 }
01226 m_extractFile << out.str();
01227 m_extractFileInv << outInv.str();
01228 m_extractFileContext << outContext.str();
01229 m_extractFileContextInv << outContextInv.str();
01230 }
01231
01232 void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
01233 {
01234 ofstream grammarFile;
01235 grammarFile.open(fileName.c_str());
01236
01237 std::string glueRulesPhraseProperty = "";
01238 if (options.phraseOrientation) {
01239 glueRulesPhraseProperty.append(" {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}");
01240 }
01241 const size_t targetSyntacticPreferencesLabelGlueTop = 0;
01242 const size_t targetSyntacticPreferencesLabelGlueX = 1;
01243
01244 if (!options.targetSyntax || options.targetSyntacticPreferences) {
01245 grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0 ||| |||" << glueRulesPhraseProperty;
01246 if (options.targetSyntacticPreferences) {
01247 grammarFile << " {{TargetPreferences 1 1 " << targetSyntacticPreferencesLabelGlueTop << " 1}}";
01248 }
01249 grammarFile << std::endl;
01250 grammarFile << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0 ||| |||" << glueRulesPhraseProperty;
01251 if (options.targetSyntacticPreferences) {
01252 grammarFile << " {{TargetPreferences 2 1 " << targetSyntacticPreferencesLabelGlueTop << " 1 1 " << targetSyntacticPreferencesLabelGlueTop << " 1}}";
01253 }
01254 grammarFile << std::endl;
01255 grammarFile << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0 ||| |||" << glueRulesPhraseProperty;
01256 if (options.targetSyntacticPreferences) {
01257 grammarFile << " {{TargetPreferences 3 1 " << targetSyntacticPreferencesLabelGlueTop << " " << targetSyntacticPreferencesLabelGlueX << " 1 1 " << targetSyntacticPreferencesLabelGlueTop << " 1}}";
01258 }
01259 grammarFile << std::endl;
01260 } else {
01261
01262 string topLabel = "QQQQQQ";
01263 for( unsigned int i=1; i<=topLabel.length(); i++) {
01264 if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
01265 topLabel = topLabel.substr(0,i);
01266 break;
01267 }
01268 }
01269
01270 grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| 0-0" << std::endl
01271 << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1" << std::endl;
01272
01273
01274 for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
01275 i != targetTopLabelCollection.end(); i++ ) {
01276 grammarFile << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2" << std::endl;
01277 }
01278
01279
01280 for( set<string>::const_iterator i = targetLabelCollection.begin();
01281 i != targetLabelCollection.end(); i++ ) {
01282 grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << std::endl;
01283 }
01284 grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << std::endl;
01285 }
01286 grammarFile.close();
01287 }
01288
01289
01290
01291
01292
01293 map<string,int> wordCount;
01294 map<string,string> wordLabel;
01295 void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence )
01296 {
01297 int countT = sentence.target.size();
01298 for(int ti=0; ti < countT; ti++) {
01299 string &word = sentence.target[ ti ];
01300 const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti);
01301 if (labels.size() > 0) {
01302 wordCount[ word ]++;
01303 wordLabel[ word ] = labels[0]->label;
01304 }
01305 }
01306 }
01307
01308 void writeUnknownWordLabel(const string & fileName)
01309 {
01310 ofstream outFile;
01311 outFile.open(fileName.c_str());
01312 typedef map<string,int>::const_iterator I;
01313
01314 map<string,int> count;
01315 int total = 0;
01316 for(I word = wordCount.begin(); word != wordCount.end(); word++) {
01317
01318 if (word->second == 1) {
01319 count[ wordLabel[ word->first ] ]++;
01320 total++;
01321 }
01322 }
01323
01324 for(I pos = count.begin(); pos != count.end(); pos++) {
01325 double ratio = ((double) pos->second / (double) total);
01326 if (ratio > 0.03)
01327 outFile << pos->first << " " << ratio << endl;
01328 }
01329
01330 outFile.close();
01331 }
01332
01333 void writePhraseOrientationPriors(const string &fileName)
01334 {
01335 ofstream outFile;
01336 outFile.open(fileName.c_str());
01337 PhraseOrientation::WritePriorCounts(outFile);
01338 outFile.close();
01339 }
01340
01341 double getPcfgScore(const SyntaxNode &node)
01342 {
01343 double score = 0.0f;
01344 SyntaxNode::AttributeMap::const_iterator p = node.attributes.find("pcfg");
01345 if (p != node.attributes.end()) {
01346 score = std::atof(p->second.c_str());
01347 }
01348 return score;
01349 }