Moses: /disk4/html/www/moses/doxygen/mosesdecoder/phrase-extract/score-main.cpp Source File

00001 /***********************************************************************
00002   Moses - factored phrase-based language decoder
00003   Copyright (C) 2009 University of Edinburgh
00004 
00005   This library is free software; you can redistribute it and/or
00006   modify it under the terms of the GNU Lesser General Public
00007   License as published by the Free Software Foundation; either
00008   version 2.1 of the License, or (at your option) any later version.
00009 
00010   This library is distributed in the hope that it will be useful,
00011   but WITHOUT ANY WARRANTY; without even the implied warranty of
00012   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013   Lesser General Public License for more details.
00014 
00015   You should have received a copy of the GNU Lesser General Public
00016   License along with this library; if not, write to the Free Software
00017   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00018  ***********************************************************************/
00019 
00020 #include <sstream>
00021 #include <assert.h>
00022 #include <cstdlib>
00023 #include <cstring>
00024 #include <map>
00025 #include <set>
00026 #include <vector>
00027 #include <algorithm>
00028 #include <boost/algorithm/string/predicate.hpp>
00029 #include <boost/unordered_map.hpp>
00030 
00031 #include "ScoreFeature.h"
00032 #include "tables-core.h"
00033 #include "ExtractionPhrasePair.h"
00034 #include "score.h"
00035 #include "InputFileStream.h"
00036 #include "OutputFileStream.h"
00037 
00038 #include "moses/Util.h"
00039 
00040 using namespace boost::algorithm;
00041 using namespace MosesTraining;
00042 
00043 namespace MosesTraining
00044 {
00045 LexicalTable lexTable;
00046 bool inverseFlag = false;
00047 bool hierarchicalFlag = false;
00048 bool pcfgFlag = false;
00049 bool phraseOrientationFlag = false;
00050 bool treeFragmentsFlag = false;
00051 bool partsOfSpeechFlag = false;
00052 bool sourceSyntaxLabelsFlag = false;
00053 bool sourceSyntaxLabelCountsLHSFlag = false;
00054 bool targetSyntacticPreferencesFlag = false;
00055 bool unpairedExtractFormatFlag = false;
00056 bool conditionOnTargetLhsFlag = false;
00057 bool wordAlignmentFlag = true;
00058 bool goodTuringFlag = false;
00059 bool kneserNeyFlag = false;
00060 bool logProbFlag = false;
00061 int negLogProb = 1;
00062 #define COC_MAX 10
00063 bool lexFlag = true;
00064 bool unalignedFlag = false;
00065 bool unalignedFWFlag = false;
00066 bool crossedNonTerm = false;
00067 bool spanLength = false;
00068 bool ruleLength = false;
00069 bool nonTermContext = false;
00070 bool nonTermContextTarget = false;
00071 bool targetConstituentBoundariesFlag = false;
00072 
00073 int countOfCounts[COC_MAX+1];
00074 int totalDistinct = 0;
00075 float minCount = 0;
00076 float minCountHierarchical = 0;
00077 bool phraseOrientationPriorsFlag = false;
00078 
00079 boost::unordered_map<std::string,float> sourceLHSCounts;
00080 boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
00081 std::set<std::string> sourceLabelSet;
00082 std::map<std::string,size_t> sourceLabels;
00083 std::vector<std::string> sourceLabelsByIndex;
00084 
00085 std::set<std::string> partsOfSpeechSet;
00086 
00087 boost::unordered_map<std::string,float> targetSyntacticPreferencesLHSCounts;
00088 boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts;
00089 std::set<std::string> targetSyntacticPreferencesLabelSet;
00090 std::map<std::string,size_t> targetSyntacticPreferencesLabels;
00091 std::vector<std::string> targetSyntacticPreferencesLabelsByIndex;
00092 
00093 std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dleft dright
00094 std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dleft dright
00095 
00096 Vocabulary vcbT;
00097 Vocabulary vcbS;
00098 
00099 } // namespace
00100 
00101 
00102 void processLine( std::string line,
00103                   int lineID, bool includeSentenceIdFlag, int &sentenceId,
00104                   PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
00105                   std::string &additionalPropertiesString,
00106                   float &count, float &pcfgSum );
00107 void writeCountOfCounts( const std::string &fileNameCountOfCounts );
00108 void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
00109                                    const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
00110                                    const std::string &fileNameLeftHandSideSourceLabelCounts,
00111                                    const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
00112 void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
00113 void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
00114                          const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
00115 void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
00116 double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
00117 double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
00118 std::set<std::string> functionWordList;
00119 void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L);
00120 void loadFunctionWords( const std::string &fileNameFunctionWords );
00121 double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
00122 int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
00123 void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
00124 void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
00125 void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
00126 size_t NumNonTerminal(const PHRASE *phraseSource);
00127 
00128 
00129 int main(int argc, char* argv[])
00130 {
00131   std::cerr << "Score v2.1 -- "
00132             << "scoring methods for extracted rules" << std::endl;
00133 
00134   ScoreFeatureManager featureManager;
00135   if (argc < 4) {
00136     std::cerr <<
00137               "syntax: score extract lex phrase-table "
00138               "[--Inverse] "
00139               "[--Hierarchical] "
00140               "[--LogProb] "
00141               "[--NegLogProb] "
00142               "[--NoLex] "
00143               "[--GoodTuring] "
00144               "[--KneserNey] "
00145               "[--NoWordAlignment] "
00146               "[--UnalignedPenalty] "
00147               "[--UnalignedFunctionWordPenalty function-word-file] "
00148               "[--MinCountHierarchical count] "
00149               "[--PartsOfSpeech] "
00150               "[--PCFG] "
00151               "[--TreeFragments] "
00152               "[--SourceLabels] "
00153               "[--SourceLabelCountsLHS] "
00154               "[--TargetSyntacticPreferences] "
00155               "[--UnpairedExtractFormat] "
00156               "[--ConditionOnTargetLHS] "
00157               "[--CrossedNonTerm]"
00158               << std::endl;
00159     std::cerr << featureManager.usage() << std::endl;
00160     exit(1);
00161   }
00162   std::string fileNameExtract = argv[1];
00163   std::string fileNameLex = argv[2];
00164   std::string fileNamePhraseTable = argv[3];
00165   std::string fileNameSourceLabelSet;
00166   std::string fileNamePartsOfSpeechSet;
00167   std::string fileNameCountOfCounts;
00168   std::string fileNameFunctionWords;
00169   std::string fileNameLeftHandSideSourceLabelCounts;
00170   std::string fileNameLeftHandSideTargetSourceLabelCounts;
00171   std::string fileNameTargetSyntacticPreferencesLabelSet;
00172   std::string fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts;
00173   std::string fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts;
00174   std::string fileNamePhraseOrientationPriors;
00175   // All unknown args are passed to feature manager.
00176   std::vector<std::string> featureArgs;
00177 
00178   for(int i=4; i<argc; i++) {
00179     if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
00180       inverseFlag = true;
00181       std::cerr << "using inverse mode" << std::endl;
00182     } else if (strcmp(argv[i],"--Hierarchical") == 0) {
00183       hierarchicalFlag = true;
00184       std::cerr << "processing hierarchical rules" << std::endl;
00185     } else if (strcmp(argv[i],"--PCFG") == 0) {
00186       pcfgFlag = true;
00187       std::cerr << "including PCFG scores" << std::endl;
00188     } else if (strcmp(argv[i],"--PhraseOrientation") == 0) {
00189       phraseOrientationFlag = true;
00190       std::cerr << "including phrase orientation information" << std::endl;
00191     } else if (strcmp(argv[i],"--TreeFragments") == 0) {
00192       treeFragmentsFlag = true;
00193       std::cerr << "including tree fragment information from syntactic parse" << std::endl;
00194     } else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
00195       partsOfSpeechFlag = true;
00196       std::cerr << "including parts-of-speech information from syntactic parse" << std::endl;
00197       fileNamePartsOfSpeechSet = std::string(fileNamePhraseTable) + ".partsOfSpeech";
00198       std::cerr << "writing parts-of-speech set to file " << fileNamePartsOfSpeechSet << std::endl;
00199     } else if (strcmp(argv[i],"--SourceLabels") == 0) {
00200       sourceSyntaxLabelsFlag = true;
00201       std::cerr << "including source label information" << std::endl;
00202       fileNameSourceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.src";
00203       std::cerr << "writing source syntax label set to file " << fileNameSourceLabelSet << std::endl;
00204     } else if (strcmp(argv[i],"--SourceLabelCountsLHS") == 0) {
00205       sourceSyntaxLabelCountsLHSFlag = true;
00206       fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs";
00207       fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs";
00208       std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
00209     } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
00210       targetSyntacticPreferencesFlag = true;
00211       std::cerr << "including target syntactic preferences information" << std::endl;
00212       fileNameTargetSyntacticPreferencesLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
00213       std::cerr << "writing target syntactic preferences label set to file " << fileNameTargetSyntacticPreferencesLabelSet << std::endl;
00214       fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
00215       fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
00216       std::cerr << "counting left-hand side target syntactic preferences labels and writing them to files "
00217                 << fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts
00218                 << " and "
00219                 << fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts
00220                 << std::endl;
00221     } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
00222       unpairedExtractFormatFlag = true;
00223       std::cerr << "processing unpaired extract format" << std::endl;
00224     } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
00225       conditionOnTargetLhsFlag = true;
00226       std::cerr << "processing unpaired extract format" << std::endl;
00227     } else if (strcmp(argv[i],"--NoWordAlignment") == 0) {
00228       wordAlignmentFlag = false;
00229       std::cerr << "omitting word alignment" << std::endl;
00230     } else if (strcmp(argv[i],"--NoLex") == 0) {
00231       lexFlag = false;
00232       std::cerr << "not computing lexical translation score" << std::endl;
00233     } else if (strcmp(argv[i],"--GoodTuring") == 0) {
00234       goodTuringFlag = true;
00235       fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
00236       std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
00237     } else if (strcmp(argv[i],"--KneserNey") == 0) {
00238       kneserNeyFlag = true;
00239       fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
00240       std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
00241     } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
00242       unalignedFlag = true;
00243       std::cerr << "using unaligned word penalty" << std::endl;
00244     } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
00245       unalignedFWFlag = true;
00246       if (i+1==argc) {
00247         std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
00248         exit(1);
00249       }
00250       fileNameFunctionWords = argv[++i];
00251       std::cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << std::endl;
00252     }  else if (strcmp(argv[i],"--LogProb") == 0) {
00253       logProbFlag = true;
00254       std::cerr << "using log-probabilities" << std::endl;
00255     } else if (strcmp(argv[i],"--NegLogProb") == 0) {
00256       logProbFlag = true;
00257       negLogProb = -1;
00258       std::cerr << "using negative log-probabilities" << std::endl;
00259     } else if (strcmp(argv[i],"--MinCount") == 0) {
00260       minCount = std::atof( argv[++i] );
00261       std::cerr << "dropping all phrase pairs occurring less than " << minCount << " times" << std::endl;
00262       minCount -= 0.00001; // account for rounding
00263     } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
00264       minCountHierarchical = std::atof( argv[++i] );
00265       std::cerr << "dropping all hierarchical phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
00266       minCountHierarchical -= 0.00001; // account for rounding
00267     } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
00268       crossedNonTerm = true;
00269       std::cerr << "crossed non-term reordering feature" << std::endl;
00270     } else if (strcmp(argv[i],"--PhraseOrientationPriors") == 0) {
00271       phraseOrientationPriorsFlag = true;
00272       if (i+1==argc) {
00273         std::cerr << "ERROR: specify priors file for phrase orientation!" << std::endl;
00274         exit(1);
00275       }
00276       fileNamePhraseOrientationPriors = argv[++i];
00277       std::cerr << "smoothing phrase orientation with priors from " << fileNamePhraseOrientationPriors << std::endl;
00278     } else if (strcmp(argv[i],"--SpanLength") == 0) {
00279       spanLength = true;
00280       std::cerr << "span length feature" << std::endl;
00281     } else if (strcmp(argv[i],"--RuleLength") == 0) {
00282       ruleLength = true;
00283       std::cerr << "rule length feature" << std::endl;
00284     } else if (strcmp(argv[i],"--NonTermContext") == 0) {
00285       nonTermContext = true;
00286       std::cerr << "non-term context" << std::endl;
00287     } else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
00288       nonTermContextTarget = true;
00289       std::cerr << "non-term context (target)" << std::endl;
00290     } else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
00291       targetConstituentBoundariesFlag = true;
00292       std::cerr << "including target constituent boundaries information" << std::endl;
00293     } else {
00294       featureArgs.push_back(argv[i]);
00295       ++i;
00296       for (; i < argc &&  strncmp(argv[i], "--", 2); ++i) {
00297         featureArgs.push_back(argv[i]);
00298       }
00299       if (i != argc) --i; //roll back, since we found another -- argument
00300     }
00301   }
00302 
00303   MaybeLog maybeLogProb(logProbFlag, negLogProb);
00304 
00305   // configure extra features
00306   if (!inverseFlag) {
00307     featureManager.configure(featureArgs);
00308   }
00309 
00310   // lexical translation table
00311   if (lexFlag) {
00312     lexTable.load( fileNameLex );
00313   }
00314 
00315   // function word list
00316   if (unalignedFWFlag) {
00317     loadFunctionWords( fileNameFunctionWords );
00318   }
00319 
00320   // compute count of counts for Good Turing discounting
00321   if (goodTuringFlag || kneserNeyFlag) {
00322     for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
00323   }
00324 
00325   if (phraseOrientationPriorsFlag) {
00326     loadOrientationPriors(fileNamePhraseOrientationPriors,orientationClassPriorsL2R,orientationClassPriorsR2L);
00327   }
00328 
00329   // sorted phrase extraction file
00330   Moses::InputFileStream extractFile(fileNameExtract);
00331 
00332   if (extractFile.fail()) {
00333     std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
00334     exit(1);
00335   }
00336 
00337   // output file: phrase translation table
00338   std::ostream *phraseTableFile;
00339 
00340   if (fileNamePhraseTable == "-") {
00341     phraseTableFile = &std::cout;
00342   } else {
00343     Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
00344     bool success = outputFile->Open(fileNamePhraseTable);
00345     if (!success) {
00346       std::cerr << "ERROR: could not open file phrase table file "
00347                 << fileNamePhraseTable << std::endl;
00348       exit(1);
00349     }
00350     phraseTableFile = outputFile;
00351   }
00352 
00353   // loop through all extracted phrase translations
00354   std::string line, lastLine;
00355   ExtractionPhrasePair *phrasePair = NULL;
00356   std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
00357   std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
00358 
00359   int tmpSentenceId;
00360   PHRASE *tmpPhraseSource, *tmpPhraseTarget;
00361   ALIGNMENT *tmpTargetToSourceAlignment;
00362   std::string tmpAdditionalPropertiesString;
00363   float tmpCount=0.0f, tmpPcfgSum=0.0f;
00364 
00365   int i=0;
00366   if ( getline(extractFile, line) ) {
00367     ++i;
00368     tmpPhraseSource = new PHRASE();
00369     tmpPhraseTarget = new PHRASE();
00370     tmpTargetToSourceAlignment = new ALIGNMENT();
00371     processLine( std::string(line),
00372                  i, featureManager.includeSentenceId(), tmpSentenceId,
00373                  tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
00374                  tmpAdditionalPropertiesString,
00375                  tmpCount, tmpPcfgSum);
00376     phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
00377                                            tmpTargetToSourceAlignment,
00378                                            tmpCount, tmpPcfgSum );
00379     phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
00380     featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
00381     phrasePairsWithSameSource.push_back( phrasePair );
00382     if ( hierarchicalFlag ) {
00383       phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
00384     }
00385     lastLine = line;
00386   }
00387 
00388   while ( getline(extractFile, line) ) {
00389 
00390     // Print progress dots to stderr.
00391     if ( ++i % 100000 == 0 ) {
00392       std::cerr << "." << std::flush;
00393     }
00394 
00395     // identical to last line? just add count
00396     if (line == lastLine) {
00397       phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
00398       continue;
00399     } else {
00400       lastLine = line;
00401     }
00402 
00403     tmpPhraseSource = new PHRASE();
00404     tmpPhraseTarget = new PHRASE();
00405     tmpTargetToSourceAlignment = new ALIGNMENT();
00406     tmpAdditionalPropertiesString.clear();
00407     processLine( std::string(line),
00408                  i, featureManager.includeSentenceId(), tmpSentenceId,
00409                  tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
00410                  tmpAdditionalPropertiesString,
00411                  tmpCount, tmpPcfgSum);
00412 
00413     bool matchesPrevious = false;
00414     bool sourceMatch = true;
00415     bool targetMatch = true;
00416     bool alignmentMatch = true; // be careful with these,
00417     // ExtractionPhrasePair::Matches() checks them in order and does not continue with the others
00418     // once the first of them has been found to have to be set to false
00419 
00420     if ( hierarchicalFlag ) {
00421       for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
00422             iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
00423         if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
00424                                sourceMatch, targetMatch, alignmentMatch ) ) {
00425           matchesPrevious = true;
00426           phrasePair = (*iter);
00427           break;
00428         }
00429       }
00430     } else {
00431       if ( phrasePair->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
00432                                 sourceMatch, targetMatch, alignmentMatch ) ) {
00433         matchesPrevious = true;
00434       }
00435     }
00436 
00437     if ( matchesPrevious ) {
00438       delete tmpPhraseSource;
00439       delete tmpPhraseTarget;
00440       if ( !phrasePair->Add( tmpTargetToSourceAlignment,
00441                              tmpCount, tmpPcfgSum ) ) {
00442         delete tmpTargetToSourceAlignment;
00443       }
00444       phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
00445       featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
00446     } else {
00447 
00448       if ( !phrasePairsWithSameSource.empty() &&
00449            !sourceMatch ) {
00450         processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
00451         for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
00452               iter!=phrasePairsWithSameSource.end(); ++iter) {
00453           delete *iter;
00454         }
00455         phrasePairsWithSameSource.clear();
00456         if ( hierarchicalFlag ) {
00457           phrasePairsWithSameSourceAndTarget.clear();
00458         }
00459       }
00460 
00461       if ( hierarchicalFlag ) {
00462         if ( !phrasePairsWithSameSourceAndTarget.empty() &&
00463              !targetMatch ) {
00464           phrasePairsWithSameSourceAndTarget.clear();
00465         }
00466       }
00467 
00468       phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
00469                                              tmpTargetToSourceAlignment,
00470                                              tmpCount, tmpPcfgSum );
00471       phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
00472       featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
00473       phrasePairsWithSameSource.push_back(phrasePair);
00474 
00475       if ( hierarchicalFlag ) {
00476         phrasePairsWithSameSourceAndTarget.push_back(phrasePair);
00477       }
00478     }
00479 
00480   }
00481 
00482   // We've been printing progress dots to stderr.  End the line.
00483   std::cerr << std::endl;
00484 
00485   processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
00486   for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
00487         iter!=phrasePairsWithSameSource.end(); ++iter) {
00488     delete *iter;
00489   }
00490   phrasePairsWithSameSource.clear();
00491 
00492 
00493   phraseTableFile->flush();
00494   if (phraseTableFile != &std::cout) {
00495     delete phraseTableFile;
00496   }
00497 
00498   // output count of count statistics
00499   if (goodTuringFlag || kneserNeyFlag) {
00500     writeCountOfCounts( fileNameCountOfCounts );
00501   }
00502 
00503   // source syntax labels
00504   if (sourceSyntaxLabelsFlag && !inverseFlag) {
00505     writeLabelSet( sourceLabelSet, fileNameSourceLabelSet );
00506   }
00507   if (sourceSyntaxLabelsFlag && sourceSyntaxLabelCountsLHSFlag && !inverseFlag) {
00508     writeLeftHandSideLabelCounts( sourceLHSCounts,
00509                                   targetLHSAndSourceLHSJointCounts,
00510                                   fileNameLeftHandSideSourceLabelCounts,
00511                                   fileNameLeftHandSideTargetSourceLabelCounts );
00512   }
00513 
00514   // parts-of-speech
00515   if (partsOfSpeechFlag && !inverseFlag) {
00516     writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet );
00517   }
00518 
00519   // target syntactic preferences labels
00520   if (targetSyntacticPreferencesFlag && !inverseFlag) {
00521     writeLabelSet( targetSyntacticPreferencesLabelSet, fileNameTargetSyntacticPreferencesLabelSet );
00522     writeLeftHandSideLabelCounts( targetSyntacticPreferencesLHSCounts,
00523                                   ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
00524                                   fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts,
00525                                   fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts );
00526   }
00527 }
00528 
00529 
00530 void processLine( std::string line,
00531                   int lineID, bool includeSentenceIdFlag, int &sentenceId,
00532                   PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
00533                   std::string &additionalPropertiesString,
00534                   float &count, float &pcfgSum )
00535 {
00536   size_t foundAdditionalProperties = line.rfind("|||");
00537   foundAdditionalProperties = line.find("{{",foundAdditionalProperties);
00538   if (foundAdditionalProperties != std::string::npos) {
00539     additionalPropertiesString = line.substr(foundAdditionalProperties);
00540     line = line.substr(0,foundAdditionalProperties);
00541   } else {
00542     additionalPropertiesString.clear();
00543   }
00544 
00545   phraseSource->clear();
00546   phraseTarget->clear();
00547   targetToSourceAlignment->clear();
00548 
00549   std::vector<std::string> token;
00550   Moses::Tokenize( token, line );
00551   int item = 1;
00552   for ( size_t j=0; j<token.size(); ++j ) {
00553     if (token[j] == "|||") {
00554       ++item;
00555     } else if (item == 1) { // source phrase
00556       phraseSource->push_back( vcbS.storeIfNew( token[j] ) );
00557     } else if (item == 2) { // target phrase
00558       phraseTarget->push_back( vcbT.storeIfNew( token[j] ) );
00559     } else if (item == 3) { // alignment
00560       int s,t;
00561       sscanf(token[j].c_str(), "%d-%d", &s, &t);
00562       if ((size_t)t >= phraseTarget->size() || (size_t)s >= phraseSource->size()) {
00563         std::cerr << "WARNING: phrase pair " << lineID
00564                   << " has alignment point (" << s << ", " << t << ")"
00565                   << " out of bounds (" << phraseSource->size() << ", " << phraseTarget->size() << ")"
00566                   << std::endl;
00567       } else {
00568         // first alignment point? -> initialize
00569         if ( targetToSourceAlignment->size() == 0 ) {
00570           size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
00571           targetToSourceAlignment->resize(numberOfTargetSymbols);
00572         }
00573         // add alignment point
00574         targetToSourceAlignment->at(t).insert(s);
00575       }
00576     } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
00577       sscanf(token[j].c_str(), "%d", &sentenceId);
00578     } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
00579       sscanf(token[j].c_str(), "%f", &count);
00580     } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
00581       float pcfgScore = std::atof( token[j].c_str() );
00582       pcfgSum = pcfgScore * count;
00583     }
00584   }
00585 
00586   if ( targetToSourceAlignment->size() == 0 ) {
00587     size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
00588     targetToSourceAlignment->resize(numberOfTargetSymbols);
00589   }
00590 
00591   if (item + (includeSentenceIdFlag?-1:0) == 3) {
00592     count = 1.0;
00593   }
00594   if (item < 3 || item > (includeSentenceIdFlag?7:6)) {
00595     std::cerr << "ERROR: faulty line " << lineID << ": " << line << std::endl;
00596   }
00597 
00598 }
00599 
00600 
00601 void writeCountOfCounts( const std::string &fileNameCountOfCounts )
00602 {
00603   // open file
00604   Moses::OutputFileStream countOfCountsFile;
00605   bool success = countOfCountsFile.Open(fileNameCountOfCounts);
00606   if (!success) {
00607     std::cerr << "ERROR: could not open count-of-counts file "
00608               << fileNameCountOfCounts << std::endl;
00609     return;
00610   }
00611 
00612   // Kneser-Ney needs the total number of phrase pairs
00613   countOfCountsFile << totalDistinct << std::endl;
00614 
00615   // write out counts
00616   for(int i=1; i<=COC_MAX; i++) {
00617     countOfCountsFile << countOfCounts[ i ] << std::endl;
00618   }
00619   countOfCountsFile.Close();
00620 }
00621 
00622 
00623 void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
00624                                    const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
00625                                    const std::string &fileNameLeftHandSideSourceLabelCounts,
00626                                    const std::string &fileNameLeftHandSideTargetSourceLabelCounts )
00627 {
00628   // open file
00629   Moses::OutputFileStream leftHandSideSourceLabelCounts;
00630   bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts);
00631   if (!success) {
00632     std::cerr << "ERROR: could not open left-hand side label counts file "
00633               << fileNameLeftHandSideSourceLabelCounts << std::endl;
00634     return;
00635   }
00636 
00637   // write source left-hand side counts
00638   for (boost::unordered_map<std::string,float>::const_iterator iter=sourceLHSCounts.begin();
00639        iter!=sourceLHSCounts.end(); ++iter) {
00640     leftHandSideSourceLabelCounts << iter->first << " " << iter->second << std::endl;
00641   }
00642 
00643   leftHandSideSourceLabelCounts.Close();
00644 
00645   // open file
00646   Moses::OutputFileStream leftHandSideTargetSourceLabelCounts;
00647   success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts);
00648   if (!success) {
00649     std::cerr << "ERROR: could not open left-hand side label joint counts file "
00650               << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
00651     return;
00652   }
00653 
00654   // write source left-hand side / target left-hand side joint counts
00655   for (boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::const_iterator iter=targetLHSAndSourceLHSJointCounts.begin();
00656        iter!=targetLHSAndSourceLHSJointCounts.end(); ++iter) {
00657     for (boost::unordered_map<std::string,float>::const_iterator iter2=(iter->second)->begin();
00658          iter2!=(iter->second)->end(); ++iter2) {
00659       leftHandSideTargetSourceLabelCounts << iter->first << " "<< iter2->first << " " << iter2->second << std::endl;
00660     }
00661   }
00662 
00663   leftHandSideTargetSourceLabelCounts.Close();
00664 }
00665 
00666 
00667 void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName )
00668 {
00669   // open file
00670   Moses::OutputFileStream out;
00671   bool success = out.Open(fileName);
00672   if (!success) {
00673     std::cerr << "ERROR: could not open file "
00674               << fileName << " for writing" << std::endl;
00675     return;
00676   }
00677 
00678   for (std::set<std::string>::const_iterator iter=labelSet.begin();
00679        iter!=labelSet.end(); ++iter) {
00680     out << *iter << std::endl;
00681   }
00682 
00683   out.Close();
00684 }
00685 
00686 
00687 void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
00688                          const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
00689 {
00690   if (phrasePairsWithSameSource.size() == 0) {
00691     return;
00692   }
00693 
00694   float totalSource = 0;
00695 
00696   //std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
00697 
00698   // loop through phrase pairs
00699   for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
00700         iter!=phrasePairsWithSameSource.end(); ++iter) {
00701     // add to total count
00702     totalSource += (*iter)->GetCount();
00703   }
00704 
00705   // output the distinct phrase pairs, one at a time
00706   for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
00707         iter!=phrasePairsWithSameSource.end(); ++iter) {
00708     // add to total count
00709     outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
00710   }
00711 }
00712 
00713 void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
00714                       float totalCount, int distinctCount,
00715                       std::ostream &phraseTableFile,
00716                       const ScoreFeatureManager& featureManager,
00717                       const MaybeLog& maybeLogProb )
00718 {
00719   assert(phrasePair.IsValid());
00720 
00721   const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
00722   float count = phrasePair.GetCount();
00723 
00724   std::map< std::string, float > domainCount;
00725 
00726   // collect count of count statistics
00727   if (goodTuringFlag || kneserNeyFlag) {
00728     totalDistinct++;
00729     int countInt = count + 0.99999;
00730     if ((countInt <= COC_MAX) &&
00731         (countInt > 0))
00732       countOfCounts[ countInt ]++;
00733   }
00734 
00735   // output phrases
00736   const PHRASE *phraseSource = phrasePair.GetSource();
00737   const PHRASE *phraseTarget = phrasePair.GetTarget();
00738 
00739   // do not output if count below threshold
00740   if (count < minCount) {
00741     return;
00742   }
00743 
00744   // do not output if hierarchical and count below threshold
00745   if (hierarchicalFlag && count < minCountHierarchical) {
00746     for(size_t j=0; j<phraseSource->size()-1; ++j) {
00747       if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
00748         return;
00749     }
00750   }
00751 
00752   // compute PCFG score
00753   float pcfgScore = 0;
00754   if (pcfgFlag && !inverseFlag) {
00755     pcfgScore = phrasePair.GetPcfgScore() / count;
00756   }
00757 
00758   // source phrase (unless inverse)
00759   if (!inverseFlag) {
00760     printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
00761     phraseTableFile << " ||| ";
00762   }
00763 
00764   // target phrase
00765   printTargetPhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
00766   phraseTableFile << " ||| ";
00767 
00768   // source phrase (if inverse)
00769   if (inverseFlag) {
00770     printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
00771     phraseTableFile << " ||| ";
00772   }
00773 
00774   // alignment
00775   if ( hierarchicalFlag ) {
00776     // always output alignment if hiero style
00777     assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
00778     std::vector<std::string> alignment;
00779     for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
00780       if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
00781         if ( bestAlignmentT2S->at(j).size() != 1 ) {
00782           std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
00783           phraseTableFile.flush();
00784           assert(bestAlignmentT2S->at(j).size() == 1);
00785         }
00786         size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
00787         //phraseTableFile << sourcePos << "-" << j << " ";
00788         std::stringstream point;
00789         point << sourcePos << "-" << j;
00790         alignment.push_back(point.str());
00791       } else {
00792         for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
00793               setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
00794           size_t sourcePos = *setIter;
00795           std::stringstream point;
00796           point << sourcePos << "-" << j;
00797           alignment.push_back(point.str());
00798         }
00799       }
00800     }
00801     // now print all alignments, sorted by source index
00802     sort(alignment.begin(), alignment.end());
00803     for (size_t i = 0; i < alignment.size(); ++i) {
00804       phraseTableFile << alignment[i] << " ";
00805     }
00806   } else if ( !inverseFlag && wordAlignmentFlag) {
00807     // alignment info in pb model
00808     for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
00809       for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
00810             setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
00811         size_t sourcePos = *setIter;
00812         phraseTableFile << sourcePos << "-" << j << " ";
00813       }
00814     }
00815   }
00816 
00817   phraseTableFile << " ||| ";
00818 
00819   // lexical translation probability
00820   if (lexFlag) {
00821     double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
00822     phraseTableFile << maybeLogProb( lexScore );
00823   }
00824 
00825   // unaligned word penalty
00826   if (unalignedFlag) {
00827     double penalty = computeUnalignedPenalty( bestAlignmentT2S );
00828     phraseTableFile << " " << maybeLogProb( penalty );
00829   }
00830 
00831   // unaligned function word penalty
00832   if (unalignedFWFlag) {
00833     double penalty = computeUnalignedFWPenalty( phraseTarget, bestAlignmentT2S );
00834     phraseTableFile << " " << maybeLogProb( penalty );
00835   }
00836 
00837   if (crossedNonTerm && !inverseFlag) {
00838     phraseTableFile << " " << calcCrossedNonTerm( phraseTarget, bestAlignmentT2S );
00839   }
00840 
00841   // target-side PCFG score
00842   if (pcfgFlag && !inverseFlag) {
00843     phraseTableFile << " " << maybeLogProb( pcfgScore );
00844   }
00845 
00846   // extra features
00847   ScoreFeatureContext context(phrasePair, maybeLogProb);
00848   std::vector<float> extraDense;
00849   std::map<std::string,float> extraSparse;
00850   featureManager.addFeatures(context, extraDense, extraSparse);
00851   for (size_t i = 0; i < extraDense.size(); ++i) {
00852     phraseTableFile << " " << extraDense[i];
00853   }
00854 
00855   for (std::map<std::string,float>::const_iterator i = extraSparse.begin();
00856        i != extraSparse.end(); ++i) {
00857     phraseTableFile << " " << i->first << " " << i->second;
00858   }
00859 
00860   // counts
00861   phraseTableFile << " ||| " << totalCount << " " << count;
00862   if (kneserNeyFlag)
00863     phraseTableFile << " " << distinctCount;
00864 
00865   phraseTableFile << " |||";
00866 
00867   // tree fragments
00868   if (treeFragmentsFlag && !inverseFlag) {
00869     const std::string *bestTreeFragment = phrasePair.FindBestPropertyValue("Tree");
00870     if (bestTreeFragment) {
00871       phraseTableFile << " {{Tree " << *bestTreeFragment << "}}";
00872     }
00873   }
00874 
00875   // parts-of-speech
00876   if (partsOfSpeechFlag && !inverseFlag) {
00877     phrasePair.UpdateVocabularyFromValueTokens("POS", partsOfSpeechSet);
00878     const std::string *bestPartOfSpeech = phrasePair.FindBestPropertyValue("POS");
00879     if (bestPartOfSpeech) {
00880       phraseTableFile << " {{POS " << *bestPartOfSpeech << "}}";
00881     }
00882   }
00883 
00884   // syntax labels
00885   if ((sourceSyntaxLabelsFlag || targetSyntacticPreferencesFlag) && !inverseFlag) {
00886     unsigned nNTs = 1;
00887     for(size_t j=0; j<phraseSource->size()-1; ++j) {
00888       if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
00889         ++nNTs;
00890     }
00891     // source syntax labels
00892     if (sourceSyntaxLabelsFlag) {
00893       std::string sourceLabelCounts;
00894       sourceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("SourceLabels",
00895                           sourceLabelSet,
00896                           sourceLHSCounts,
00897                           targetLHSAndSourceLHSJointCounts,
00898                           vcbT);
00899       if ( !sourceLabelCounts.empty() ) {
00900         phraseTableFile << " {{SourceLabels "
00901                         << phraseSource->size() // for convenience: number of symbols in this rule (incl. left hand side NT)
00902                         << " "
00903                         << count // rule count
00904                         << sourceLabelCounts
00905                         << "}}";
00906       }
00907     }
00908     // target syntactic preferences labels
00909     if (targetSyntacticPreferencesFlag) {
00910       std::string targetSyntacticPreferencesLabelCounts;
00911       targetSyntacticPreferencesLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
00912                                               targetSyntacticPreferencesLabelSet,
00913                                               targetSyntacticPreferencesLHSCounts,
00914                                               ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
00915                                               vcbT);
00916       if (!targetSyntacticPreferencesLabelCounts.empty()) {
00917         phraseTableFile << " {{TargetPreferences "
00918                         << nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
00919                         << " "
00920                         << count // rule count
00921                         << targetSyntacticPreferencesLabelCounts
00922                         << "}}";
00923       }
00924     }
00925   }
00926 
00927   // phrase orientation
00928   if (phraseOrientationFlag && !inverseFlag) {
00929     phraseTableFile << " {{Orientation ";
00930     phrasePair.CollectAllPhraseOrientations("Orientation",orientationClassPriorsL2R,orientationClassPriorsR2L,0.5,phraseTableFile);
00931     phraseTableFile << "}}";
00932   }
00933 
00934   if (spanLength && !inverseFlag) {
00935     std::string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
00936     if (!propValue.empty()) {
00937       phraseTableFile << " {{SpanLength " << propValue << "}}";
00938     }
00939   }
00940 
00941   if (ruleLength && !inverseFlag) {
00942     std::string propValue = phrasePair.CollectAllPropertyValues("RuleLength");
00943     if (!propValue.empty()) {
00944       phraseTableFile << " {{RuleLength " << propValue << "}}";
00945     }
00946   }
00947 
00948   if (nonTermContext && !inverseFlag) {
00949     std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
00950     if (!propValue.empty() && propValue.size() < 50000) {
00951       size_t nNTs = NumNonTerminal(phraseSource);
00952       phraseTableFile << " {{NonTermContext " << nNTs << " " << propValue << "}}";
00953     }
00954   }
00955 
00956   if (nonTermContextTarget && !inverseFlag) {
00957     std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContextTarget");
00958     if (!propValue.empty() && propValue.size() < 50000) {
00959       size_t nNTs = NumNonTerminal(phraseSource);
00960       phraseTableFile << " {{NonTermContextTarget " << nNTs << " " << propValue << "}}";
00961     }
00962   }
00963 
00964   // target constituent boundaries
00965   if (targetConstituentBoundariesFlag && !inverseFlag) {
00966     const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft");
00967     if (!targetConstituentBoundariesLeftValues.empty()) {
00968       phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}";
00969     }
00970     const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent");
00971     if (!targetConstituentBoundariesRightAdjacentValues.empty()) {
00972       phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}";
00973     }
00974   }
00975 
00976   phraseTableFile << std::endl;
00977 }
00978 
00979 size_t NumNonTerminal(const PHRASE *phraseSource)
00980 {
00981   size_t nNTs = 0;
00982   for(size_t j=0; j<phraseSource->size()-1; ++j) {
00983     if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
00984       ++nNTs;
00985   }
00986   return nNTs;
00987 }
00988 
00989 void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
00990                            std::vector<float> &orientationClassPriorsL2R,
00991                            std::vector<float> &orientationClassPriorsR2L)
00992 {
00993   assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
00994 
00995   std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors;
00996   Moses::InputFileStream inFile(fileNamePhraseOrientationPriors);
00997   if (inFile.fail()) {
00998     std::cerr << " - ERROR: could not open file" << std::endl;
00999     exit(1);
01000   }
01001 
01002   std::string line;
01003   size_t linesRead = 0;
01004   float l2rSum = 0;
01005   float r2lSum = 0;
01006   while (getline(inFile, line)) {
01007     std::istringstream tokenizer(line);
01008     std::string key;
01009     tokenizer >> key;
01010 
01011     bool l2rFlag = false;
01012     bool r2lFlag = false;
01013     if (starts_with(key, "L2R_")) {
01014       l2rFlag = true;
01015     }
01016     if (starts_with(key, "R2L_")) {
01017       r2lFlag = true;
01018     }
01019     if (!l2rFlag && !r2lFlag) {
01020       std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
01021     }
01022     key.erase(0,4);
01023 
01024     int orientationClassId = -1;
01025     if (!key.compare("mono")) {
01026       orientationClassId = 0;
01027     }
01028     if (!key.compare("swap")) {
01029       orientationClassId = 1;
01030     }
01031     if (!key.compare("dleft")) {
01032       orientationClassId = 2;
01033     }
01034     if (!key.compare("dright")) {
01035       orientationClassId = 3;
01036     }
01037     if (orientationClassId == -1) {
01038       std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
01039     }
01040 
01041     float count;
01042     tokenizer >> count;
01043 
01044     if (l2rFlag) {
01045       orientationClassPriorsL2R[orientationClassId] += count;
01046       l2rSum += count;
01047     }
01048     if (r2lFlag) {
01049       orientationClassPriorsR2L[orientationClassId] += count;
01050       r2lSum += count;
01051     }
01052 
01053     ++linesRead;
01054   }
01055 
01056   // normalization: return prior probabilities, not counts
01057   if (l2rSum != 0) {
01058     for (std::vector<float>::iterator orientationClassPriorsL2RIt = orientationClassPriorsL2R.begin();
01059          orientationClassPriorsL2RIt != orientationClassPriorsL2R.end(); ++orientationClassPriorsL2RIt) {
01060       *orientationClassPriorsL2RIt /= l2rSum;
01061     }
01062   }
01063   if (r2lSum != 0) {
01064     for (std::vector<float>::iterator orientationClassPriorsR2LIt = orientationClassPriorsR2L.begin();
01065          orientationClassPriorsR2LIt != orientationClassPriorsR2L.end(); ++orientationClassPriorsR2LIt) {
01066       *orientationClassPriorsR2LIt /= r2lSum;
01067     }
01068   }
01069 
01070   std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl;
01071   inFile.Close();
01072 }
01073 
01074 
01075 
01076 bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource )
01077 {
01078   for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) {
01079     if (currTarget == targetPos) {
01080       // skip
01081     } else {
01082       const std::set<size_t> &sourceSet = alignmentTargetToSource->at(currTarget);
01083       for (std::set<size_t>::const_iterator iter = sourceSet.begin();
01084            iter != sourceSet.end(); ++iter) {
01085         size_t currSource = *iter;
01086 
01087         if ((currTarget < targetPos && currSource > sourcePos)
01088             || (currTarget > targetPos && currSource < sourcePos)
01089            ) {
01090           return true;
01091         }
01092       }
01093 
01094     }
01095   }
01096 
01097   return false;
01098 }
01099 
01100 int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
01101 {
01102   assert(phraseTarget->size() >= alignmentTargetToSource->size() );
01103 
01104   for (size_t targetPos = 0; targetPos < alignmentTargetToSource->size(); ++targetPos) {
01105 
01106     if ( isNonTerminal(vcbT.getWord( phraseTarget->at(targetPos) ))) {
01107       const std::set<size_t> &alignmentPoints = alignmentTargetToSource->at(targetPos);
01108       assert( alignmentPoints.size() == 1 );
01109       size_t sourcePos = *alignmentPoints.begin();
01110       bool ret = calcCrossedNonTerm(targetPos, sourcePos, alignmentTargetToSource);
01111       if (ret)
01112         return 1;
01113     }
01114   }
01115 
01116   return 0;
01117 }
01118 
01119 
01120 double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
01121 {
01122   // unaligned word counter
01123   double unaligned = 1.0;
01124   // only checking target words - source words are caught when computing inverse
01125   for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
01126     const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
01127     if (srcIndices.empty()) {
01128       unaligned *= 2.718;
01129     }
01130   }
01131   return unaligned;
01132 }
01133 
01134 
01135 double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
01136 {
01137   // unaligned word counter
01138   double unaligned = 1.0;
01139   // only checking target words - source words are caught when computing inverse
01140   for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
01141     const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
01142     if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
01143       unaligned *= 2.718;
01144     }
01145   }
01146   return unaligned;
01147 }
01148 
01149 void loadFunctionWords( const std::string &fileName )
01150 {
01151   std::cerr << "Loading function word list from " << fileName;
01152   Moses::InputFileStream inFile(fileName);
01153   if (inFile.fail()) {
01154     std::cerr << " - ERROR: could not open file" << std::endl;
01155     exit(1);
01156   }
01157 
01158   std::string line;
01159   while(getline(inFile, line)) {
01160     std::vector<std::string> token;
01161     Moses::Tokenize( token, line );
01162     if (token.size() > 0)
01163       functionWordList.insert( token[0] );
01164   }
01165 
01166   std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
01167   inFile.Close();
01168 }
01169 
01170 
01171 double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
01172 {
01173   // lexical translation probability
01174   double lexScore = 1.0;
01175   int null = vcbS.getWordID("NULL");
01176   // all target words have to be explained
01177   for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
01178     const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
01179     if (srcIndices.empty()) {
01180       // explain unaligned word by NULL
01181       lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
01182     } else {
01183       // go through all the aligned words to compute average
01184       double thisWordScore = 0;
01185       for (std::set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
01186         thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
01187       }
01188       lexScore *= thisWordScore / (double)srcIndices.size();
01189     }
01190   }
01191   return lexScore;
01192 }
01193 
01194 
01195 void LexicalTable::load( const std::string &fileName )
01196 {
01197   std::cerr << "Loading lexical translation table from " << fileName;
01198   Moses::InputFileStream inFile(fileName);
01199   if (inFile.fail()) {
01200     std::cerr << " - ERROR: could not open file" << std::endl;
01201     exit(1);
01202   }
01203 
01204   std::string line;
01205   int i=0;
01206   while(getline(inFile, line)) {
01207     i++;
01208     if (i%100000 == 0) std::cerr << "." << std::flush;
01209 
01210     std::vector<std::string> token;
01211     Moses::Tokenize( token, line );
01212     if (token.size() != 3) {
01213       std::cerr << "line " << i << " in " << fileName
01214                 << " has wrong number of tokens, skipping:" << std::endl
01215                 << token.size() << " " << token[0] << " " << line << std::endl;
01216       continue;
01217     }
01218 
01219     double prob = std::atof( token[2].c_str() );
01220     WORD_ID wordT = vcbT.storeIfNew( token[0] );
01221     WORD_ID wordS = vcbS.storeIfNew( token[1] );
01222     ltable[ wordS ][ wordT ] = prob;
01223   }
01224   std::cerr << std::endl;
01225 }
01226 
01227 
01228 void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
01229                        const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
01230 {
01231   // get corresponding target non-terminal and output pair
01232   ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT();
01233   invertAlignment(phraseSource, phraseTarget, targetToSourceAlignment, sourceToTargetAlignment);
01234   // output source symbols, except root, in rule table format
01235   for (std::size_t i = 0; i < phraseSource->size()-1; ++i) {
01236     const std::string &word = vcbS.getWord(phraseSource->at(i));
01237     if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
01238       out << word << " ";
01239       continue;
01240     }
01241     const std::set<std::size_t> &alignmentPoints = sourceToTargetAlignment->at(i);
01242     assert(alignmentPoints.size() == 1);
01243     size_t j = *(alignmentPoints.begin());
01244     if (inverseFlag) {
01245       out << vcbT.getWord(phraseTarget->at(j)) << word << " ";
01246     } else {
01247       out << word << vcbT.getWord(phraseTarget->at(j)) << " ";
01248     }
01249   }
01250   // output source root symbol
01251   if (conditionOnTargetLhsFlag && !inverseFlag) {
01252     out << "[X]";
01253   } else {
01254     out << vcbS.getWord(phraseSource->back());
01255   }
01256   delete sourceToTargetAlignment;
01257 }
01258 
01259 
01260 void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
01261                        const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
01262 {
01263   // output target symbols, except root, in rule table format
01264   for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {
01265     const std::string &word = vcbT.getWord(phraseTarget->at(i));
01266     if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
01267       out << word << " ";
01268       continue;
01269     }
01270     // get corresponding source non-terminal and output pair
01271     std::set<std::size_t> alignmentPoints = targetToSourceAlignment->at(i);
01272     assert(alignmentPoints.size() == 1);
01273     int j = *(alignmentPoints.begin());
01274     if (inverseFlag) {
01275       out << word << vcbS.getWord(phraseSource->at(j)) << " ";
01276     } else {
01277       out << vcbS.getWord(phraseSource->at(j)) << word << " ";
01278     }
01279   }
01280   // output target root symbol
01281   if (conditionOnTargetLhsFlag) {
01282     if (inverseFlag) {
01283       out << "[X]";
01284     } else {
01285       out << vcbS.getWord(phraseSource->back());
01286     }
01287   } else {
01288     out << vcbT.getWord(phraseTarget->back());
01289   }
01290 }
01291 
01292 
01293 void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget,
01294                      const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment)
01295 {
01296 // typedef std::vector< std::set<size_t> > ALIGNMENT;
01297 
01298   outSourceToTargetAlignment->clear();
01299   size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size());
01300   outSourceToTargetAlignment->resize(numberOfSourceSymbols);
01301   // add alignment point
01302   for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) {
01303     for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin();
01304           setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) {
01305       size_t sourcePosition = *setIter;
01306       outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition);
01307     }
01308   }
01309 }
01310