00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <sstream>
00021 #include <assert.h>
00022 #include <cstdlib>
00023 #include <cstring>
00024 #include <map>
00025 #include <set>
00026 #include <vector>
00027 #include <algorithm>
00028 #include <boost/algorithm/string/predicate.hpp>
00029 #include <boost/unordered_map.hpp>
00030
00031 #include "ScoreFeature.h"
00032 #include "tables-core.h"
00033 #include "ExtractionPhrasePair.h"
00034 #include "score.h"
00035 #include "InputFileStream.h"
00036 #include "OutputFileStream.h"
00037
00038 #include "moses/Util.h"
00039
00040 using namespace boost::algorithm;
00041 using namespace MosesTraining;
00042
00043 namespace MosesTraining
00044 {
00045 LexicalTable lexTable;
00046 bool inverseFlag = false;
00047 bool hierarchicalFlag = false;
00048 bool pcfgFlag = false;
00049 bool phraseOrientationFlag = false;
00050 bool treeFragmentsFlag = false;
00051 bool partsOfSpeechFlag = false;
00052 bool sourceSyntaxLabelsFlag = false;
00053 bool sourceSyntaxLabelCountsLHSFlag = false;
00054 bool targetSyntacticPreferencesFlag = false;
00055 bool unpairedExtractFormatFlag = false;
00056 bool conditionOnTargetLhsFlag = false;
00057 bool wordAlignmentFlag = true;
00058 bool goodTuringFlag = false;
00059 bool kneserNeyFlag = false;
00060 bool logProbFlag = false;
00061 int negLogProb = 1;
00062 #define COC_MAX 10
00063 bool lexFlag = true;
00064 bool unalignedFlag = false;
00065 bool unalignedFWFlag = false;
00066 bool crossedNonTerm = false;
00067 bool spanLength = false;
00068 bool ruleLength = false;
00069 bool nonTermContext = false;
00070 bool nonTermContextTarget = false;
00071 bool targetConstituentBoundariesFlag = false;
00072
00073 int countOfCounts[COC_MAX+1];
00074 int totalDistinct = 0;
00075 float minCount = 0;
00076 float minCountHierarchical = 0;
00077 bool phraseOrientationPriorsFlag = false;
00078
00079 boost::unordered_map<std::string,float> sourceLHSCounts;
00080 boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
00081 std::set<std::string> sourceLabelSet;
00082 std::map<std::string,size_t> sourceLabels;
00083 std::vector<std::string> sourceLabelsByIndex;
00084
00085 std::set<std::string> partsOfSpeechSet;
00086
00087 boost::unordered_map<std::string,float> targetSyntacticPreferencesLHSCounts;
00088 boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts;
00089 std::set<std::string> targetSyntacticPreferencesLabelSet;
00090 std::map<std::string,size_t> targetSyntacticPreferencesLabels;
00091 std::vector<std::string> targetSyntacticPreferencesLabelsByIndex;
00092
00093 std::vector<float> orientationClassPriorsL2R(4,0);
00094 std::vector<float> orientationClassPriorsR2L(4,0);
00095
00096 Vocabulary vcbT;
00097 Vocabulary vcbS;
00098
00099 }
00100
00101
00102 void processLine( std::string line,
00103 int lineID, bool includeSentenceIdFlag, int &sentenceId,
00104 PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
00105 std::string &additionalPropertiesString,
00106 float &count, float &pcfgSum );
00107 void writeCountOfCounts( const std::string &fileNameCountOfCounts );
00108 void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
00109 const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
00110 const std::string &fileNameLeftHandSideSourceLabelCounts,
00111 const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
00112 void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
00113 void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
00114 const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
00115 void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
00116 double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
00117 double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
00118 std::set<std::string> functionWordList;
00119 void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L);
00120 void loadFunctionWords( const std::string &fileNameFunctionWords );
00121 double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
00122 int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
00123 void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
00124 void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
00125 void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
00126 size_t NumNonTerminal(const PHRASE *phraseSource);
00127
00128
00129 int main(int argc, char* argv[])
00130 {
00131 std::cerr << "Score v2.1 -- "
00132 << "scoring methods for extracted rules" << std::endl;
00133
00134 ScoreFeatureManager featureManager;
00135 if (argc < 4) {
00136 std::cerr <<
00137 "syntax: score extract lex phrase-table "
00138 "[--Inverse] "
00139 "[--Hierarchical] "
00140 "[--LogProb] "
00141 "[--NegLogProb] "
00142 "[--NoLex] "
00143 "[--GoodTuring] "
00144 "[--KneserNey] "
00145 "[--NoWordAlignment] "
00146 "[--UnalignedPenalty] "
00147 "[--UnalignedFunctionWordPenalty function-word-file] "
00148 "[--MinCountHierarchical count] "
00149 "[--PartsOfSpeech] "
00150 "[--PCFG] "
00151 "[--TreeFragments] "
00152 "[--SourceLabels] "
00153 "[--SourceLabelCountsLHS] "
00154 "[--TargetSyntacticPreferences] "
00155 "[--UnpairedExtractFormat] "
00156 "[--ConditionOnTargetLHS] "
00157 "[--CrossedNonTerm]"
00158 << std::endl;
00159 std::cerr << featureManager.usage() << std::endl;
00160 exit(1);
00161 }
00162 std::string fileNameExtract = argv[1];
00163 std::string fileNameLex = argv[2];
00164 std::string fileNamePhraseTable = argv[3];
00165 std::string fileNameSourceLabelSet;
00166 std::string fileNamePartsOfSpeechSet;
00167 std::string fileNameCountOfCounts;
00168 std::string fileNameFunctionWords;
00169 std::string fileNameLeftHandSideSourceLabelCounts;
00170 std::string fileNameLeftHandSideTargetSourceLabelCounts;
00171 std::string fileNameTargetSyntacticPreferencesLabelSet;
00172 std::string fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts;
00173 std::string fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts;
00174 std::string fileNamePhraseOrientationPriors;
00175
00176 std::vector<std::string> featureArgs;
00177
00178 for(int i=4; i<argc; i++) {
00179 if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
00180 inverseFlag = true;
00181 std::cerr << "using inverse mode" << std::endl;
00182 } else if (strcmp(argv[i],"--Hierarchical") == 0) {
00183 hierarchicalFlag = true;
00184 std::cerr << "processing hierarchical rules" << std::endl;
00185 } else if (strcmp(argv[i],"--PCFG") == 0) {
00186 pcfgFlag = true;
00187 std::cerr << "including PCFG scores" << std::endl;
00188 } else if (strcmp(argv[i],"--PhraseOrientation") == 0) {
00189 phraseOrientationFlag = true;
00190 std::cerr << "including phrase orientation information" << std::endl;
00191 } else if (strcmp(argv[i],"--TreeFragments") == 0) {
00192 treeFragmentsFlag = true;
00193 std::cerr << "including tree fragment information from syntactic parse" << std::endl;
00194 } else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
00195 partsOfSpeechFlag = true;
00196 std::cerr << "including parts-of-speech information from syntactic parse" << std::endl;
00197 fileNamePartsOfSpeechSet = std::string(fileNamePhraseTable) + ".partsOfSpeech";
00198 std::cerr << "writing parts-of-speech set to file " << fileNamePartsOfSpeechSet << std::endl;
00199 } else if (strcmp(argv[i],"--SourceLabels") == 0) {
00200 sourceSyntaxLabelsFlag = true;
00201 std::cerr << "including source label information" << std::endl;
00202 fileNameSourceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.src";
00203 std::cerr << "writing source syntax label set to file " << fileNameSourceLabelSet << std::endl;
00204 } else if (strcmp(argv[i],"--SourceLabelCountsLHS") == 0) {
00205 sourceSyntaxLabelCountsLHSFlag = true;
00206 fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs";
00207 fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs";
00208 std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
00209 } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
00210 targetSyntacticPreferencesFlag = true;
00211 std::cerr << "including target syntactic preferences information" << std::endl;
00212 fileNameTargetSyntacticPreferencesLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
00213 std::cerr << "writing target syntactic preferences label set to file " << fileNameTargetSyntacticPreferencesLabelSet << std::endl;
00214 fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
00215 fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
00216 std::cerr << "counting left-hand side target syntactic preferences labels and writing them to files "
00217 << fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts
00218 << " and "
00219 << fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts
00220 << std::endl;
00221 } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
00222 unpairedExtractFormatFlag = true;
00223 std::cerr << "processing unpaired extract format" << std::endl;
00224 } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
00225 conditionOnTargetLhsFlag = true;
00226 std::cerr << "processing unpaired extract format" << std::endl;
00227 } else if (strcmp(argv[i],"--NoWordAlignment") == 0) {
00228 wordAlignmentFlag = false;
00229 std::cerr << "omitting word alignment" << std::endl;
00230 } else if (strcmp(argv[i],"--NoLex") == 0) {
00231 lexFlag = false;
00232 std::cerr << "not computing lexical translation score" << std::endl;
00233 } else if (strcmp(argv[i],"--GoodTuring") == 0) {
00234 goodTuringFlag = true;
00235 fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
00236 std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
00237 } else if (strcmp(argv[i],"--KneserNey") == 0) {
00238 kneserNeyFlag = true;
00239 fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
00240 std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
00241 } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
00242 unalignedFlag = true;
00243 std::cerr << "using unaligned word penalty" << std::endl;
00244 } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
00245 unalignedFWFlag = true;
00246 if (i+1==argc) {
00247 std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
00248 exit(1);
00249 }
00250 fileNameFunctionWords = argv[++i];
00251 std::cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << std::endl;
00252 } else if (strcmp(argv[i],"--LogProb") == 0) {
00253 logProbFlag = true;
00254 std::cerr << "using log-probabilities" << std::endl;
00255 } else if (strcmp(argv[i],"--NegLogProb") == 0) {
00256 logProbFlag = true;
00257 negLogProb = -1;
00258 std::cerr << "using negative log-probabilities" << std::endl;
00259 } else if (strcmp(argv[i],"--MinCount") == 0) {
00260 minCount = std::atof( argv[++i] );
00261 std::cerr << "dropping all phrase pairs occurring less than " << minCount << " times" << std::endl;
00262 minCount -= 0.00001;
00263 } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
00264 minCountHierarchical = std::atof( argv[++i] );
00265 std::cerr << "dropping all hierarchical phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
00266 minCountHierarchical -= 0.00001;
00267 } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
00268 crossedNonTerm = true;
00269 std::cerr << "crossed non-term reordering feature" << std::endl;
00270 } else if (strcmp(argv[i],"--PhraseOrientationPriors") == 0) {
00271 phraseOrientationPriorsFlag = true;
00272 if (i+1==argc) {
00273 std::cerr << "ERROR: specify priors file for phrase orientation!" << std::endl;
00274 exit(1);
00275 }
00276 fileNamePhraseOrientationPriors = argv[++i];
00277 std::cerr << "smoothing phrase orientation with priors from " << fileNamePhraseOrientationPriors << std::endl;
00278 } else if (strcmp(argv[i],"--SpanLength") == 0) {
00279 spanLength = true;
00280 std::cerr << "span length feature" << std::endl;
00281 } else if (strcmp(argv[i],"--RuleLength") == 0) {
00282 ruleLength = true;
00283 std::cerr << "rule length feature" << std::endl;
00284 } else if (strcmp(argv[i],"--NonTermContext") == 0) {
00285 nonTermContext = true;
00286 std::cerr << "non-term context" << std::endl;
00287 } else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
00288 nonTermContextTarget = true;
00289 std::cerr << "non-term context (target)" << std::endl;
00290 } else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
00291 targetConstituentBoundariesFlag = true;
00292 std::cerr << "including target constituent boundaries information" << std::endl;
00293 } else {
00294 featureArgs.push_back(argv[i]);
00295 ++i;
00296 for (; i < argc && strncmp(argv[i], "--", 2); ++i) {
00297 featureArgs.push_back(argv[i]);
00298 }
00299 if (i != argc) --i;
00300 }
00301 }
00302
00303 MaybeLog maybeLogProb(logProbFlag, negLogProb);
00304
00305
00306 if (!inverseFlag) {
00307 featureManager.configure(featureArgs);
00308 }
00309
00310
00311 if (lexFlag) {
00312 lexTable.load( fileNameLex );
00313 }
00314
00315
00316 if (unalignedFWFlag) {
00317 loadFunctionWords( fileNameFunctionWords );
00318 }
00319
00320
00321 if (goodTuringFlag || kneserNeyFlag) {
00322 for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
00323 }
00324
00325 if (phraseOrientationPriorsFlag) {
00326 loadOrientationPriors(fileNamePhraseOrientationPriors,orientationClassPriorsL2R,orientationClassPriorsR2L);
00327 }
00328
00329
00330 Moses::InputFileStream extractFile(fileNameExtract);
00331
00332 if (extractFile.fail()) {
00333 std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
00334 exit(1);
00335 }
00336
00337
00338 std::ostream *phraseTableFile;
00339
00340 if (fileNamePhraseTable == "-") {
00341 phraseTableFile = &std::cout;
00342 } else {
00343 Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
00344 bool success = outputFile->Open(fileNamePhraseTable);
00345 if (!success) {
00346 std::cerr << "ERROR: could not open file phrase table file "
00347 << fileNamePhraseTable << std::endl;
00348 exit(1);
00349 }
00350 phraseTableFile = outputFile;
00351 }
00352
00353
00354 std::string line, lastLine;
00355 ExtractionPhrasePair *phrasePair = NULL;
00356 std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
00357 std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget;
00358
00359 int tmpSentenceId;
00360 PHRASE *tmpPhraseSource, *tmpPhraseTarget;
00361 ALIGNMENT *tmpTargetToSourceAlignment;
00362 std::string tmpAdditionalPropertiesString;
00363 float tmpCount=0.0f, tmpPcfgSum=0.0f;
00364
00365 int i=0;
00366 if ( getline(extractFile, line) ) {
00367 ++i;
00368 tmpPhraseSource = new PHRASE();
00369 tmpPhraseTarget = new PHRASE();
00370 tmpTargetToSourceAlignment = new ALIGNMENT();
00371 processLine( std::string(line),
00372 i, featureManager.includeSentenceId(), tmpSentenceId,
00373 tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
00374 tmpAdditionalPropertiesString,
00375 tmpCount, tmpPcfgSum);
00376 phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
00377 tmpTargetToSourceAlignment,
00378 tmpCount, tmpPcfgSum );
00379 phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
00380 featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
00381 phrasePairsWithSameSource.push_back( phrasePair );
00382 if ( hierarchicalFlag ) {
00383 phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
00384 }
00385 lastLine = line;
00386 }
00387
00388 while ( getline(extractFile, line) ) {
00389
00390
00391 if ( ++i % 100000 == 0 ) {
00392 std::cerr << "." << std::flush;
00393 }
00394
00395
00396 if (line == lastLine) {
00397 phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
00398 continue;
00399 } else {
00400 lastLine = line;
00401 }
00402
00403 tmpPhraseSource = new PHRASE();
00404 tmpPhraseTarget = new PHRASE();
00405 tmpTargetToSourceAlignment = new ALIGNMENT();
00406 tmpAdditionalPropertiesString.clear();
00407 processLine( std::string(line),
00408 i, featureManager.includeSentenceId(), tmpSentenceId,
00409 tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
00410 tmpAdditionalPropertiesString,
00411 tmpCount, tmpPcfgSum);
00412
00413 bool matchesPrevious = false;
00414 bool sourceMatch = true;
00415 bool targetMatch = true;
00416 bool alignmentMatch = true;
00417
00418
00419
00420 if ( hierarchicalFlag ) {
00421 for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
00422 iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
00423 if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
00424 sourceMatch, targetMatch, alignmentMatch ) ) {
00425 matchesPrevious = true;
00426 phrasePair = (*iter);
00427 break;
00428 }
00429 }
00430 } else {
00431 if ( phrasePair->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
00432 sourceMatch, targetMatch, alignmentMatch ) ) {
00433 matchesPrevious = true;
00434 }
00435 }
00436
00437 if ( matchesPrevious ) {
00438 delete tmpPhraseSource;
00439 delete tmpPhraseTarget;
00440 if ( !phrasePair->Add( tmpTargetToSourceAlignment,
00441 tmpCount, tmpPcfgSum ) ) {
00442 delete tmpTargetToSourceAlignment;
00443 }
00444 phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
00445 featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
00446 } else {
00447
00448 if ( !phrasePairsWithSameSource.empty() &&
00449 !sourceMatch ) {
00450 processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
00451 for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
00452 iter!=phrasePairsWithSameSource.end(); ++iter) {
00453 delete *iter;
00454 }
00455 phrasePairsWithSameSource.clear();
00456 if ( hierarchicalFlag ) {
00457 phrasePairsWithSameSourceAndTarget.clear();
00458 }
00459 }
00460
00461 if ( hierarchicalFlag ) {
00462 if ( !phrasePairsWithSameSourceAndTarget.empty() &&
00463 !targetMatch ) {
00464 phrasePairsWithSameSourceAndTarget.clear();
00465 }
00466 }
00467
00468 phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
00469 tmpTargetToSourceAlignment,
00470 tmpCount, tmpPcfgSum );
00471 phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
00472 featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
00473 phrasePairsWithSameSource.push_back(phrasePair);
00474
00475 if ( hierarchicalFlag ) {
00476 phrasePairsWithSameSourceAndTarget.push_back(phrasePair);
00477 }
00478 }
00479
00480 }
00481
00482
00483 std::cerr << std::endl;
00484
00485 processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
00486 for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
00487 iter!=phrasePairsWithSameSource.end(); ++iter) {
00488 delete *iter;
00489 }
00490 phrasePairsWithSameSource.clear();
00491
00492
00493 phraseTableFile->flush();
00494 if (phraseTableFile != &std::cout) {
00495 delete phraseTableFile;
00496 }
00497
00498
00499 if (goodTuringFlag || kneserNeyFlag) {
00500 writeCountOfCounts( fileNameCountOfCounts );
00501 }
00502
00503
00504 if (sourceSyntaxLabelsFlag && !inverseFlag) {
00505 writeLabelSet( sourceLabelSet, fileNameSourceLabelSet );
00506 }
00507 if (sourceSyntaxLabelsFlag && sourceSyntaxLabelCountsLHSFlag && !inverseFlag) {
00508 writeLeftHandSideLabelCounts( sourceLHSCounts,
00509 targetLHSAndSourceLHSJointCounts,
00510 fileNameLeftHandSideSourceLabelCounts,
00511 fileNameLeftHandSideTargetSourceLabelCounts );
00512 }
00513
00514
00515 if (partsOfSpeechFlag && !inverseFlag) {
00516 writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet );
00517 }
00518
00519
00520 if (targetSyntacticPreferencesFlag && !inverseFlag) {
00521 writeLabelSet( targetSyntacticPreferencesLabelSet, fileNameTargetSyntacticPreferencesLabelSet );
00522 writeLeftHandSideLabelCounts( targetSyntacticPreferencesLHSCounts,
00523 ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
00524 fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts,
00525 fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts );
00526 }
00527 }
00528
00529
00530 void processLine( std::string line,
00531 int lineID, bool includeSentenceIdFlag, int &sentenceId,
00532 PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
00533 std::string &additionalPropertiesString,
00534 float &count, float &pcfgSum )
00535 {
00536 size_t foundAdditionalProperties = line.rfind("|||");
00537 foundAdditionalProperties = line.find("{{",foundAdditionalProperties);
00538 if (foundAdditionalProperties != std::string::npos) {
00539 additionalPropertiesString = line.substr(foundAdditionalProperties);
00540 line = line.substr(0,foundAdditionalProperties);
00541 } else {
00542 additionalPropertiesString.clear();
00543 }
00544
00545 phraseSource->clear();
00546 phraseTarget->clear();
00547 targetToSourceAlignment->clear();
00548
00549 std::vector<std::string> token;
00550 Moses::Tokenize( token, line );
00551 int item = 1;
00552 for ( size_t j=0; j<token.size(); ++j ) {
00553 if (token[j] == "|||") {
00554 ++item;
00555 } else if (item == 1) {
00556 phraseSource->push_back( vcbS.storeIfNew( token[j] ) );
00557 } else if (item == 2) {
00558 phraseTarget->push_back( vcbT.storeIfNew( token[j] ) );
00559 } else if (item == 3) {
00560 int s,t;
00561 sscanf(token[j].c_str(), "%d-%d", &s, &t);
00562 if ((size_t)t >= phraseTarget->size() || (size_t)s >= phraseSource->size()) {
00563 std::cerr << "WARNING: phrase pair " << lineID
00564 << " has alignment point (" << s << ", " << t << ")"
00565 << " out of bounds (" << phraseSource->size() << ", " << phraseTarget->size() << ")"
00566 << std::endl;
00567 } else {
00568
00569 if ( targetToSourceAlignment->size() == 0 ) {
00570 size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
00571 targetToSourceAlignment->resize(numberOfTargetSymbols);
00572 }
00573
00574 targetToSourceAlignment->at(t).insert(s);
00575 }
00576 } else if (includeSentenceIdFlag && item == 4) {
00577 sscanf(token[j].c_str(), "%d", &sentenceId);
00578 } else if (item + (includeSentenceIdFlag?-1:0) == 4) {
00579 sscanf(token[j].c_str(), "%f", &count);
00580 } else if (item + (includeSentenceIdFlag?-1:0) == 5) {
00581 float pcfgScore = std::atof( token[j].c_str() );
00582 pcfgSum = pcfgScore * count;
00583 }
00584 }
00585
00586 if ( targetToSourceAlignment->size() == 0 ) {
00587 size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
00588 targetToSourceAlignment->resize(numberOfTargetSymbols);
00589 }
00590
00591 if (item + (includeSentenceIdFlag?-1:0) == 3) {
00592 count = 1.0;
00593 }
00594 if (item < 3 || item > (includeSentenceIdFlag?7:6)) {
00595 std::cerr << "ERROR: faulty line " << lineID << ": " << line << std::endl;
00596 }
00597
00598 }
00599
00600
00601 void writeCountOfCounts( const std::string &fileNameCountOfCounts )
00602 {
00603
00604 Moses::OutputFileStream countOfCountsFile;
00605 bool success = countOfCountsFile.Open(fileNameCountOfCounts);
00606 if (!success) {
00607 std::cerr << "ERROR: could not open count-of-counts file "
00608 << fileNameCountOfCounts << std::endl;
00609 return;
00610 }
00611
00612
00613 countOfCountsFile << totalDistinct << std::endl;
00614
00615
00616 for(int i=1; i<=COC_MAX; i++) {
00617 countOfCountsFile << countOfCounts[ i ] << std::endl;
00618 }
00619 countOfCountsFile.Close();
00620 }
00621
00622
00623 void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
00624 const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
00625 const std::string &fileNameLeftHandSideSourceLabelCounts,
00626 const std::string &fileNameLeftHandSideTargetSourceLabelCounts )
00627 {
00628
00629 Moses::OutputFileStream leftHandSideSourceLabelCounts;
00630 bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts);
00631 if (!success) {
00632 std::cerr << "ERROR: could not open left-hand side label counts file "
00633 << fileNameLeftHandSideSourceLabelCounts << std::endl;
00634 return;
00635 }
00636
00637
00638 for (boost::unordered_map<std::string,float>::const_iterator iter=sourceLHSCounts.begin();
00639 iter!=sourceLHSCounts.end(); ++iter) {
00640 leftHandSideSourceLabelCounts << iter->first << " " << iter->second << std::endl;
00641 }
00642
00643 leftHandSideSourceLabelCounts.Close();
00644
00645
00646 Moses::OutputFileStream leftHandSideTargetSourceLabelCounts;
00647 success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts);
00648 if (!success) {
00649 std::cerr << "ERROR: could not open left-hand side label joint counts file "
00650 << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
00651 return;
00652 }
00653
00654
00655 for (boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::const_iterator iter=targetLHSAndSourceLHSJointCounts.begin();
00656 iter!=targetLHSAndSourceLHSJointCounts.end(); ++iter) {
00657 for (boost::unordered_map<std::string,float>::const_iterator iter2=(iter->second)->begin();
00658 iter2!=(iter->second)->end(); ++iter2) {
00659 leftHandSideTargetSourceLabelCounts << iter->first << " "<< iter2->first << " " << iter2->second << std::endl;
00660 }
00661 }
00662
00663 leftHandSideTargetSourceLabelCounts.Close();
00664 }
00665
00666
00667 void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName )
00668 {
00669
00670 Moses::OutputFileStream out;
00671 bool success = out.Open(fileName);
00672 if (!success) {
00673 std::cerr << "ERROR: could not open file "
00674 << fileName << " for writing" << std::endl;
00675 return;
00676 }
00677
00678 for (std::set<std::string>::const_iterator iter=labelSet.begin();
00679 iter!=labelSet.end(); ++iter) {
00680 out << *iter << std::endl;
00681 }
00682
00683 out.Close();
00684 }
00685
00686
00687 void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
00688 const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
00689 {
00690 if (phrasePairsWithSameSource.size() == 0) {
00691 return;
00692 }
00693
00694 float totalSource = 0;
00695
00696
00697
00698
00699 for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
00700 iter!=phrasePairsWithSameSource.end(); ++iter) {
00701
00702 totalSource += (*iter)->GetCount();
00703 }
00704
00705
00706 for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
00707 iter!=phrasePairsWithSameSource.end(); ++iter) {
00708
00709 outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
00710 }
00711 }
00712
00713 void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
00714 float totalCount, int distinctCount,
00715 std::ostream &phraseTableFile,
00716 const ScoreFeatureManager& featureManager,
00717 const MaybeLog& maybeLogProb )
00718 {
00719 assert(phrasePair.IsValid());
00720
00721 const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
00722 float count = phrasePair.GetCount();
00723
00724 std::map< std::string, float > domainCount;
00725
00726
00727 if (goodTuringFlag || kneserNeyFlag) {
00728 totalDistinct++;
00729 int countInt = count + 0.99999;
00730 if ((countInt <= COC_MAX) &&
00731 (countInt > 0))
00732 countOfCounts[ countInt ]++;
00733 }
00734
00735
00736 const PHRASE *phraseSource = phrasePair.GetSource();
00737 const PHRASE *phraseTarget = phrasePair.GetTarget();
00738
00739
00740 if (count < minCount) {
00741 return;
00742 }
00743
00744
00745 if (hierarchicalFlag && count < minCountHierarchical) {
00746 for(size_t j=0; j<phraseSource->size()-1; ++j) {
00747 if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
00748 return;
00749 }
00750 }
00751
00752
00753 float pcfgScore = 0;
00754 if (pcfgFlag && !inverseFlag) {
00755 pcfgScore = phrasePair.GetPcfgScore() / count;
00756 }
00757
00758
00759 if (!inverseFlag) {
00760 printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
00761 phraseTableFile << " ||| ";
00762 }
00763
00764
00765 printTargetPhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
00766 phraseTableFile << " ||| ";
00767
00768
00769 if (inverseFlag) {
00770 printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
00771 phraseTableFile << " ||| ";
00772 }
00773
00774
00775 if ( hierarchicalFlag ) {
00776
00777 assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
00778 std::vector<std::string> alignment;
00779 for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
00780 if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
00781 if ( bestAlignmentT2S->at(j).size() != 1 ) {
00782 std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
00783 phraseTableFile.flush();
00784 assert(bestAlignmentT2S->at(j).size() == 1);
00785 }
00786 size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
00787
00788 std::stringstream point;
00789 point << sourcePos << "-" << j;
00790 alignment.push_back(point.str());
00791 } else {
00792 for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
00793 setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
00794 size_t sourcePos = *setIter;
00795 std::stringstream point;
00796 point << sourcePos << "-" << j;
00797 alignment.push_back(point.str());
00798 }
00799 }
00800 }
00801
00802 sort(alignment.begin(), alignment.end());
00803 for (size_t i = 0; i < alignment.size(); ++i) {
00804 phraseTableFile << alignment[i] << " ";
00805 }
00806 } else if ( !inverseFlag && wordAlignmentFlag) {
00807
00808 for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
00809 for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
00810 setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
00811 size_t sourcePos = *setIter;
00812 phraseTableFile << sourcePos << "-" << j << " ";
00813 }
00814 }
00815 }
00816
00817 phraseTableFile << " ||| ";
00818
00819
00820 if (lexFlag) {
00821 double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
00822 phraseTableFile << maybeLogProb( lexScore );
00823 }
00824
00825
00826 if (unalignedFlag) {
00827 double penalty = computeUnalignedPenalty( bestAlignmentT2S );
00828 phraseTableFile << " " << maybeLogProb( penalty );
00829 }
00830
00831
00832 if (unalignedFWFlag) {
00833 double penalty = computeUnalignedFWPenalty( phraseTarget, bestAlignmentT2S );
00834 phraseTableFile << " " << maybeLogProb( penalty );
00835 }
00836
00837 if (crossedNonTerm && !inverseFlag) {
00838 phraseTableFile << " " << calcCrossedNonTerm( phraseTarget, bestAlignmentT2S );
00839 }
00840
00841
00842 if (pcfgFlag && !inverseFlag) {
00843 phraseTableFile << " " << maybeLogProb( pcfgScore );
00844 }
00845
00846
00847 ScoreFeatureContext context(phrasePair, maybeLogProb);
00848 std::vector<float> extraDense;
00849 std::map<std::string,float> extraSparse;
00850 featureManager.addFeatures(context, extraDense, extraSparse);
00851 for (size_t i = 0; i < extraDense.size(); ++i) {
00852 phraseTableFile << " " << extraDense[i];
00853 }
00854
00855 for (std::map<std::string,float>::const_iterator i = extraSparse.begin();
00856 i != extraSparse.end(); ++i) {
00857 phraseTableFile << " " << i->first << " " << i->second;
00858 }
00859
00860
00861 phraseTableFile << " ||| " << totalCount << " " << count;
00862 if (kneserNeyFlag)
00863 phraseTableFile << " " << distinctCount;
00864
00865 phraseTableFile << " |||";
00866
00867
00868 if (treeFragmentsFlag && !inverseFlag) {
00869 const std::string *bestTreeFragment = phrasePair.FindBestPropertyValue("Tree");
00870 if (bestTreeFragment) {
00871 phraseTableFile << " {{Tree " << *bestTreeFragment << "}}";
00872 }
00873 }
00874
00875
00876 if (partsOfSpeechFlag && !inverseFlag) {
00877 phrasePair.UpdateVocabularyFromValueTokens("POS", partsOfSpeechSet);
00878 const std::string *bestPartOfSpeech = phrasePair.FindBestPropertyValue("POS");
00879 if (bestPartOfSpeech) {
00880 phraseTableFile << " {{POS " << *bestPartOfSpeech << "}}";
00881 }
00882 }
00883
00884
00885 if ((sourceSyntaxLabelsFlag || targetSyntacticPreferencesFlag) && !inverseFlag) {
00886 unsigned nNTs = 1;
00887 for(size_t j=0; j<phraseSource->size()-1; ++j) {
00888 if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
00889 ++nNTs;
00890 }
00891
00892 if (sourceSyntaxLabelsFlag) {
00893 std::string sourceLabelCounts;
00894 sourceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("SourceLabels",
00895 sourceLabelSet,
00896 sourceLHSCounts,
00897 targetLHSAndSourceLHSJointCounts,
00898 vcbT);
00899 if ( !sourceLabelCounts.empty() ) {
00900 phraseTableFile << " {{SourceLabels "
00901 << phraseSource->size()
00902 << " "
00903 << count
00904 << sourceLabelCounts
00905 << "}}";
00906 }
00907 }
00908
00909 if (targetSyntacticPreferencesFlag) {
00910 std::string targetSyntacticPreferencesLabelCounts;
00911 targetSyntacticPreferencesLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
00912 targetSyntacticPreferencesLabelSet,
00913 targetSyntacticPreferencesLHSCounts,
00914 ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
00915 vcbT);
00916 if (!targetSyntacticPreferencesLabelCounts.empty()) {
00917 phraseTableFile << " {{TargetPreferences "
00918 << nNTs
00919 << " "
00920 << count
00921 << targetSyntacticPreferencesLabelCounts
00922 << "}}";
00923 }
00924 }
00925 }
00926
00927
00928 if (phraseOrientationFlag && !inverseFlag) {
00929 phraseTableFile << " {{Orientation ";
00930 phrasePair.CollectAllPhraseOrientations("Orientation",orientationClassPriorsL2R,orientationClassPriorsR2L,0.5,phraseTableFile);
00931 phraseTableFile << "}}";
00932 }
00933
00934 if (spanLength && !inverseFlag) {
00935 std::string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
00936 if (!propValue.empty()) {
00937 phraseTableFile << " {{SpanLength " << propValue << "}}";
00938 }
00939 }
00940
00941 if (ruleLength && !inverseFlag) {
00942 std::string propValue = phrasePair.CollectAllPropertyValues("RuleLength");
00943 if (!propValue.empty()) {
00944 phraseTableFile << " {{RuleLength " << propValue << "}}";
00945 }
00946 }
00947
00948 if (nonTermContext && !inverseFlag) {
00949 std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
00950 if (!propValue.empty() && propValue.size() < 50000) {
00951 size_t nNTs = NumNonTerminal(phraseSource);
00952 phraseTableFile << " {{NonTermContext " << nNTs << " " << propValue << "}}";
00953 }
00954 }
00955
00956 if (nonTermContextTarget && !inverseFlag) {
00957 std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContextTarget");
00958 if (!propValue.empty() && propValue.size() < 50000) {
00959 size_t nNTs = NumNonTerminal(phraseSource);
00960 phraseTableFile << " {{NonTermContextTarget " << nNTs << " " << propValue << "}}";
00961 }
00962 }
00963
00964
00965 if (targetConstituentBoundariesFlag && !inverseFlag) {
00966 const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft");
00967 if (!targetConstituentBoundariesLeftValues.empty()) {
00968 phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}";
00969 }
00970 const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent");
00971 if (!targetConstituentBoundariesRightAdjacentValues.empty()) {
00972 phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}";
00973 }
00974 }
00975
00976 phraseTableFile << std::endl;
00977 }
00978
00979 size_t NumNonTerminal(const PHRASE *phraseSource)
00980 {
00981 size_t nNTs = 0;
00982 for(size_t j=0; j<phraseSource->size()-1; ++j) {
00983 if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
00984 ++nNTs;
00985 }
00986 return nNTs;
00987 }
00988
00989 void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
00990 std::vector<float> &orientationClassPriorsL2R,
00991 std::vector<float> &orientationClassPriorsR2L)
00992 {
00993 assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4);
00994
00995 std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors;
00996 Moses::InputFileStream inFile(fileNamePhraseOrientationPriors);
00997 if (inFile.fail()) {
00998 std::cerr << " - ERROR: could not open file" << std::endl;
00999 exit(1);
01000 }
01001
01002 std::string line;
01003 size_t linesRead = 0;
01004 float l2rSum = 0;
01005 float r2lSum = 0;
01006 while (getline(inFile, line)) {
01007 std::istringstream tokenizer(line);
01008 std::string key;
01009 tokenizer >> key;
01010
01011 bool l2rFlag = false;
01012 bool r2lFlag = false;
01013 if (starts_with(key, "L2R_")) {
01014 l2rFlag = true;
01015 }
01016 if (starts_with(key, "R2L_")) {
01017 r2lFlag = true;
01018 }
01019 if (!l2rFlag && !r2lFlag) {
01020 std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
01021 }
01022 key.erase(0,4);
01023
01024 int orientationClassId = -1;
01025 if (!key.compare("mono")) {
01026 orientationClassId = 0;
01027 }
01028 if (!key.compare("swap")) {
01029 orientationClassId = 1;
01030 }
01031 if (!key.compare("dleft")) {
01032 orientationClassId = 2;
01033 }
01034 if (!key.compare("dright")) {
01035 orientationClassId = 3;
01036 }
01037 if (orientationClassId == -1) {
01038 std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
01039 }
01040
01041 float count;
01042 tokenizer >> count;
01043
01044 if (l2rFlag) {
01045 orientationClassPriorsL2R[orientationClassId] += count;
01046 l2rSum += count;
01047 }
01048 if (r2lFlag) {
01049 orientationClassPriorsR2L[orientationClassId] += count;
01050 r2lSum += count;
01051 }
01052
01053 ++linesRead;
01054 }
01055
01056
01057 if (l2rSum != 0) {
01058 for (std::vector<float>::iterator orientationClassPriorsL2RIt = orientationClassPriorsL2R.begin();
01059 orientationClassPriorsL2RIt != orientationClassPriorsL2R.end(); ++orientationClassPriorsL2RIt) {
01060 *orientationClassPriorsL2RIt /= l2rSum;
01061 }
01062 }
01063 if (r2lSum != 0) {
01064 for (std::vector<float>::iterator orientationClassPriorsR2LIt = orientationClassPriorsR2L.begin();
01065 orientationClassPriorsR2LIt != orientationClassPriorsR2L.end(); ++orientationClassPriorsR2LIt) {
01066 *orientationClassPriorsR2LIt /= r2lSum;
01067 }
01068 }
01069
01070 std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl;
01071 inFile.Close();
01072 }
01073
01074
01075
01076 bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource )
01077 {
01078 for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) {
01079 if (currTarget == targetPos) {
01080
01081 } else {
01082 const std::set<size_t> &sourceSet = alignmentTargetToSource->at(currTarget);
01083 for (std::set<size_t>::const_iterator iter = sourceSet.begin();
01084 iter != sourceSet.end(); ++iter) {
01085 size_t currSource = *iter;
01086
01087 if ((currTarget < targetPos && currSource > sourcePos)
01088 || (currTarget > targetPos && currSource < sourcePos)
01089 ) {
01090 return true;
01091 }
01092 }
01093
01094 }
01095 }
01096
01097 return false;
01098 }
01099
01100 int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
01101 {
01102 assert(phraseTarget->size() >= alignmentTargetToSource->size() );
01103
01104 for (size_t targetPos = 0; targetPos < alignmentTargetToSource->size(); ++targetPos) {
01105
01106 if ( isNonTerminal(vcbT.getWord( phraseTarget->at(targetPos) ))) {
01107 const std::set<size_t> &alignmentPoints = alignmentTargetToSource->at(targetPos);
01108 assert( alignmentPoints.size() == 1 );
01109 size_t sourcePos = *alignmentPoints.begin();
01110 bool ret = calcCrossedNonTerm(targetPos, sourcePos, alignmentTargetToSource);
01111 if (ret)
01112 return 1;
01113 }
01114 }
01115
01116 return 0;
01117 }
01118
01119
01120 double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
01121 {
01122
01123 double unaligned = 1.0;
01124
01125 for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
01126 const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
01127 if (srcIndices.empty()) {
01128 unaligned *= 2.718;
01129 }
01130 }
01131 return unaligned;
01132 }
01133
01134
01135 double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
01136 {
01137
01138 double unaligned = 1.0;
01139
01140 for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
01141 const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
01142 if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
01143 unaligned *= 2.718;
01144 }
01145 }
01146 return unaligned;
01147 }
01148
01149 void loadFunctionWords( const std::string &fileName )
01150 {
01151 std::cerr << "Loading function word list from " << fileName;
01152 Moses::InputFileStream inFile(fileName);
01153 if (inFile.fail()) {
01154 std::cerr << " - ERROR: could not open file" << std::endl;
01155 exit(1);
01156 }
01157
01158 std::string line;
01159 while(getline(inFile, line)) {
01160 std::vector<std::string> token;
01161 Moses::Tokenize( token, line );
01162 if (token.size() > 0)
01163 functionWordList.insert( token[0] );
01164 }
01165
01166 std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
01167 inFile.Close();
01168 }
01169
01170
01171 double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
01172 {
01173
01174 double lexScore = 1.0;
01175 int null = vcbS.getWordID("NULL");
01176
01177 for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
01178 const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
01179 if (srcIndices.empty()) {
01180
01181 lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
01182 } else {
01183
01184 double thisWordScore = 0;
01185 for (std::set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
01186 thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
01187 }
01188 lexScore *= thisWordScore / (double)srcIndices.size();
01189 }
01190 }
01191 return lexScore;
01192 }
01193
01194
01195 void LexicalTable::load( const std::string &fileName )
01196 {
01197 std::cerr << "Loading lexical translation table from " << fileName;
01198 Moses::InputFileStream inFile(fileName);
01199 if (inFile.fail()) {
01200 std::cerr << " - ERROR: could not open file" << std::endl;
01201 exit(1);
01202 }
01203
01204 std::string line;
01205 int i=0;
01206 while(getline(inFile, line)) {
01207 i++;
01208 if (i%100000 == 0) std::cerr << "." << std::flush;
01209
01210 std::vector<std::string> token;
01211 Moses::Tokenize( token, line );
01212 if (token.size() != 3) {
01213 std::cerr << "line " << i << " in " << fileName
01214 << " has wrong number of tokens, skipping:" << std::endl
01215 << token.size() << " " << token[0] << " " << line << std::endl;
01216 continue;
01217 }
01218
01219 double prob = std::atof( token[2].c_str() );
01220 WORD_ID wordT = vcbT.storeIfNew( token[0] );
01221 WORD_ID wordS = vcbS.storeIfNew( token[1] );
01222 ltable[ wordS ][ wordT ] = prob;
01223 }
01224 std::cerr << std::endl;
01225 }
01226
01227
01228 void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
01229 const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
01230 {
01231
01232 ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT();
01233 invertAlignment(phraseSource, phraseTarget, targetToSourceAlignment, sourceToTargetAlignment);
01234
01235 for (std::size_t i = 0; i < phraseSource->size()-1; ++i) {
01236 const std::string &word = vcbS.getWord(phraseSource->at(i));
01237 if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
01238 out << word << " ";
01239 continue;
01240 }
01241 const std::set<std::size_t> &alignmentPoints = sourceToTargetAlignment->at(i);
01242 assert(alignmentPoints.size() == 1);
01243 size_t j = *(alignmentPoints.begin());
01244 if (inverseFlag) {
01245 out << vcbT.getWord(phraseTarget->at(j)) << word << " ";
01246 } else {
01247 out << word << vcbT.getWord(phraseTarget->at(j)) << " ";
01248 }
01249 }
01250
01251 if (conditionOnTargetLhsFlag && !inverseFlag) {
01252 out << "[X]";
01253 } else {
01254 out << vcbS.getWord(phraseSource->back());
01255 }
01256 delete sourceToTargetAlignment;
01257 }
01258
01259
01260 void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
01261 const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
01262 {
01263
01264 for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {
01265 const std::string &word = vcbT.getWord(phraseTarget->at(i));
01266 if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
01267 out << word << " ";
01268 continue;
01269 }
01270
01271 std::set<std::size_t> alignmentPoints = targetToSourceAlignment->at(i);
01272 assert(alignmentPoints.size() == 1);
01273 int j = *(alignmentPoints.begin());
01274 if (inverseFlag) {
01275 out << word << vcbS.getWord(phraseSource->at(j)) << " ";
01276 } else {
01277 out << vcbS.getWord(phraseSource->at(j)) << word << " ";
01278 }
01279 }
01280
01281 if (conditionOnTargetLhsFlag) {
01282 if (inverseFlag) {
01283 out << "[X]";
01284 } else {
01285 out << vcbS.getWord(phraseSource->back());
01286 }
01287 } else {
01288 out << vcbT.getWord(phraseTarget->back());
01289 }
01290 }
01291
01292
01293 void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget,
01294 const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment)
01295 {
01296
01297
01298 outSourceToTargetAlignment->clear();
01299 size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size());
01300 outSourceToTargetAlignment->resize(numberOfSourceSymbols);
01301
01302 for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) {
01303 for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin();
01304 setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) {
01305 size_t sourcePosition = *setIter;
01306 outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition);
01307 }
01308 }
01309 }
01310