Moses: /disk4/html/www/moses/doxygen/mosesdecoder/phrase-extract/statistics-main.cpp Source File

00001 // $Id$
00002 // vim:tabstop=2
00003 
00004 #include <sstream>
00005 #include <cstdio>
00006 #include <iostream>
00007 #include <fstream>
00008 #include <vector>
00009 #include <string>
00010 #include <cstdlib>
00011 #include <cassert>
00012 #include <ctime>
00013 
00014 #include "AlignmentPhrase.h"
00015 #include "tables-core.h"
00016 #include "InputFileStream.h"
00017 #include "util/tokenize.hh"
00018 
00019 using namespace std;
00020 using namespace MosesTraining;
00021 
00022 namespace MosesTraining
00023 {
00024 
// One extracted phrase pair together with its word alignment.
//
// english / foreign hold the ids under which create() stored the two phrases
// in the global phrase tables. alignedToE[e] lists the foreign positions
// aligned to English position e, and alignedToF[f] lists the English
// positions aligned to foreign position f.
class PhraseAlignment
{
public:
  // Phrase ids; -1 until create() has assigned real ones, so that reading
  // them on a freshly constructed object is well defined.
  int english, foreign;
  std::vector< std::vector<std::size_t> > alignedToE;
  std::vector< std::vector<std::size_t> > alignedToF;

  PhraseAlignment() : english(-1), foreign(-1) {}

  // Fills this object from one line of the extract file; the int is the
  // line number used in warnings. Returns true for a real phrase pair.
  bool create( const char*, int );
  // Drops all alignment points.
  void clear();
  // True when both objects refer to the same phrases with the same alignment.
  bool equals( const PhraseAlignment& );
};
00036 
00037 class LexicalTable
00038 {
00039 public:
00040   map< WORD_ID, map< WORD_ID, double > > ltable;
00041   void load( const string &);
00042 };
00043 
00044 }
00045 
00046 void processPhrasePairs( vector< PhraseAlignment > & );
00047 
00048 ofstream phraseTableFile;
00049 
00050 Vocabulary vcbE;
00051 Vocabulary vcbF;
00052 LexicalTable lexTable;
00053 PhraseTable phraseTableE;
00054 PhraseTable phraseTableF;
00055 bool inverseFlag;
00056 int phrasePairBase = 0; // only used for "proper" conditioning
00057 
00058 int main(int argc, char* argv[])
00059 {
00060   cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n"
00061        << "modifying PhraseScore v1.4 written by Philipp Koehn\n"
00062        << "It computes statistics for extracted phrase pairs\n"
00063        << "if (direct):\n"
00064        << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
00065        << "if (inverse)\n"
00066        << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";
00067 
00068   if (argc != 4 && argc != 5) {
00069     cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
00070     exit(1);
00071   }
00072   char* &fileNameExtract = argv[1];
00073   char* &fileNameLex = argv[2];
00074   char* &fileNamePhraseTable = argv[3];
00075   inverseFlag = false;
00076   if (argc > 4) {
00077     inverseFlag = true;
00078     cerr << "using inverse mode\n";
00079   }
00080 
00081   // lexical translation table
00082   lexTable.load( fileNameLex );
00083 
00084   // sorted phrase extraction file
00085   Moses::InputFileStream extractFile(fileNameExtract);
00086 
00087   if (extractFile.fail()) {
00088     cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
00089     exit(1);
00090   }
00091   istream &extractFileP = extractFile;
00092 
00093   // output file: phrase translation table
00094   phraseTableFile.open(fileNamePhraseTable);
00095   if (phraseTableFile.fail()) {
00096     cerr << "ERROR: could not open file phrase table file "
00097          << fileNamePhraseTable << endl;
00098     exit(1);
00099   }
00100 
00101   // loop through all extracted phrase translations
00102   int lastForeign = -1;
00103   vector< PhraseAlignment > phrasePairsWithSameF;
00104   int i=0;
00105 
00106   string line;
00107   while(getline(extractFileP, line)) {
00108     if (extractFileP.eof()) break;
00109     if (++i % 100000 == 0) cerr << "." << flush;
00110 
00111     PhraseAlignment phrasePair;
00112     bool isPhrasePair = phrasePair.create( line.c_str(), i );
00113     if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
00114       processPhrasePairs( phrasePairsWithSameF );
00115       for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
00116         phrasePairsWithSameF[j].clear();
00117       phrasePairsWithSameF.clear();
00118       phraseTableE.clear();
00119       phraseTableF.clear();
00120       phrasePair.clear(); // process line again, since phrase tables flushed
00121       phrasePair.create( line.c_str(), i );
00122       phrasePairBase = 0;
00123     }
00124     lastForeign = phrasePair.foreign;
00125     if (isPhrasePair)
00126       phrasePairsWithSameF.push_back( phrasePair );
00127     else
00128       phrasePairBase++;
00129   }
00130   processPhrasePairs( phrasePairsWithSameF );
00131   phraseTableFile.close();
00132 }
00133 
00134 void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
00135 {
00136   if (phrasePair.size() == 0) return;
00137   map<int, int> countE;
00138   map<int, int> alignmentE;
00139   int totalCount = 0;
00140   int currentCount = 0;
00141   int maxSameCount = 0;
00142   int maxSame = -1;
00143   int old = -1;
00144   for(size_t i=0; i<phrasePair.size(); i++) {
00145     if (i>0) {
00146       if (phrasePair[old].english == phrasePair[i].english) {
00147         if (! phrasePair[i].equals( phrasePair[old] )) {
00148           if (currentCount > maxSameCount) {
00149             maxSameCount = currentCount;
00150             maxSame = i-1;
00151           }
00152           currentCount = 0;
00153         }
00154       } else {
00155         // wrap up old E
00156         if (currentCount > maxSameCount) {
00157           maxSameCount = currentCount;
00158           maxSame = i-1;
00159         }
00160 
00161         alignmentE[ phrasePair[old].english ] = maxSame;
00162         //      if (maxSameCount != totalCount)
00163         //  cout << "max count is " << maxSameCount << "/" << totalCount << endl;
00164 
00165         // get ready for new E
00166         totalCount = 0;
00167         currentCount = 0;
00168         maxSameCount = 0;
00169         maxSame = -1;
00170       }
00171     }
00172     countE[ phrasePair[i].english ]++;
00173     old = i;
00174     currentCount++;
00175     totalCount++;
00176   }
00177 
00178   // wrap up old E
00179   if (currentCount > maxSameCount) {
00180     maxSameCount = currentCount;
00181     maxSame = phrasePair.size()-1;
00182   }
00183   alignmentE[ phrasePair[old].english ] = maxSame;
00184   //  if (maxSameCount != totalCount)
00185   //    cout << "max count is " << maxSameCount << "/" << totalCount << endl;
00186 
00187   // output table
00188   typedef map< int, int >::iterator II;
00189   PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
00190   size_t index = 0;
00191   for(II i = countE.begin(); i != countE.end(); i++) {
00192     //cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n";
00193     //cerr << index << endl;
00194 
00195     // foreign phrase (unless inverse)
00196     if (! inverseFlag) {
00197       for(size_t j=0; j<phraseF.size(); j++) {
00198         phraseTableFile << vcbF.getWord( phraseF[j] );
00199         phraseTableFile << " ";
00200       }
00201       phraseTableFile << "||| ";
00202     }
00203 
00204     // english phrase
00205     PHRASE phraseE = phraseTableE.getPhrase( i->first );
00206     for(size_t j=0; j<phraseE.size(); j++) {
00207       phraseTableFile << vcbE.getWord( phraseE[j] );
00208       phraseTableFile << " ";
00209     }
00210     phraseTableFile << "||| ";
00211 
00212     // foreign phrase (if inverse)
00213     if (inverseFlag) {
00214       for(size_t j=0; j<phraseF.size(); j++) {
00215         phraseTableFile << vcbF.getWord( phraseF[j] );
00216         phraseTableFile << " ";
00217       }
00218       phraseTableFile << "||| ";
00219     }
00220 
00221     // phrase pair frequency
00222     phraseTableFile << i->second;
00223 
00224     //source phrase pair frequency
00225     phraseTableFile << " " << phrasePair.size();
00226 
00227     // source phrase length
00228     phraseTableFile     << " " << phraseF.size();
00229 
00230     // target phrase length
00231     phraseTableFile     << " " << phraseE.size();
00232 
00233     phraseTableFile << endl;
00234 
00235     index += i->second;
00236   }
00237 }
00238 
00239 bool PhraseAlignment::create(const char line[], int lineID )
00240 {
00241   const vector< string > token = util::tokenize( line );
00242   int item = 1;
00243   PHRASE phraseF, phraseE;
00244   for (size_t j=0; j<token.size(); j++) {
00245     if (token[j] == "|||") item++;
00246     else {
00247       if (item == 1)
00248         phraseF.push_back( vcbF.storeIfNew( token[j] ) );
00249       else if (item == 2)
00250         phraseE.push_back( vcbE.storeIfNew( token[j] ) );
00251       else if (item == 3) {
00252         int e,f;
00253         sscanf(token[j].c_str(), "%d-%d", &f, &e);
00254         if ((size_t)e >= phraseE.size() || (size_t)f >= phraseF.size()) {
00255           cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n";
00256         } else {
00257           if (alignedToE.size() == 0) {
00258             vector< size_t > dummy;
00259             for(size_t i=0; i<phraseE.size(); i++)
00260               alignedToE.push_back( dummy );
00261             for(size_t i=0; i<phraseF.size(); i++)
00262               alignedToF.push_back( dummy );
00263             foreign = phraseTableF.storeIfNew( phraseF );
00264             english = phraseTableE.storeIfNew( phraseE );
00265           }
00266           alignedToE[e].push_back( f );
00267           alignedToF[f].push_back( e );
00268         }
00269       }
00270     }
00271   }
00272   return (item>2); // real phrase pair, not just foreign phrase
00273 }
00274 
00275 void PhraseAlignment::clear()
00276 {
00277   for(size_t i=0; i<alignedToE.size(); i++)
00278     alignedToE[i].clear();
00279   for(size_t i=0; i<alignedToF.size(); i++)
00280     alignedToF[i].clear();
00281   alignedToE.clear();
00282   alignedToF.clear();
00283 }
00284 
00285 bool PhraseAlignment::equals( const PhraseAlignment& other )
00286 {
00287   if (this == &other) return true;
00288   if (other.english != english) return false;
00289   if (other.foreign != foreign) return false;
00290   PHRASE phraseE = phraseTableE.getPhrase( english );
00291   PHRASE phraseF = phraseTableF.getPhrase( foreign );
00292   for(size_t i=0; i<phraseE.size(); i++) {
00293     if (alignedToE[i].size() != other.alignedToE[i].size()) return false;
00294     for(size_t j=0; j<alignedToE[i].size(); j++) {
00295       if (alignedToE[i][j] != other.alignedToE[i][j]) return false;
00296     }
00297   }
00298   for(size_t i=0; i<phraseF.size(); i++) {
00299     if (alignedToF[i].size() != other.alignedToF[i].size()) return false;
00300     for(size_t j=0; j<alignedToF[i].size(); j++) {
00301       if (alignedToF[i][j] != other.alignedToF[i][j]) return false;
00302     }
00303   }
00304   return true;
00305 }
00306 
00307 void LexicalTable::load( const string &filePath )
00308 {
00309   cerr << "Loading lexical translation table from " << filePath;
00310   ifstream inFile;
00311   inFile.open(filePath.c_str());
00312   if (inFile.fail()) {
00313     cerr << " - ERROR: could not open file\n";
00314     exit(1);
00315   }
00316   istream *inFileP = &inFile;
00317 
00318   string line;
00319 
00320   int i=0;
00321   while(getline(*inFileP, line)) {
00322     i++;
00323     if (i%100000 == 0) cerr << "." << flush;
00324 
00325     const vector<string> token = util::tokenize( line );
00326     if (token.size() != 3) {
00327       cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
00328            token.size() << " " << token[0] << " " << line << endl;
00329       continue;
00330     }
00331 
00332     double prob = atof( token[2].c_str() );
00333     WORD_ID wordE = vcbE.storeIfNew( token[0] );
00334     WORD_ID wordF = vcbF.storeIfNew( token[1] );
00335     ltable[ wordF ][ wordE ] = prob;
00336   }
00337   cerr << endl;
00338 }