00001
00002
00003
00004 #include <sstream>
00005 #include <cstdio>
00006 #include <iostream>
00007 #include <fstream>
00008 #include <vector>
00009 #include <string>
00010 #include <cstdlib>
00011 #include <cassert>
00012 #include <ctime>
00013
00014 #include "AlignmentPhrase.h"
00015 #include "tables-core.h"
00016 #include "InputFileStream.h"
00017 #include "util/tokenize.hh"
00018
00019 using namespace std;
00020 using namespace MosesTraining;
00021
00022 namespace MosesTraining
00023 {
00024
/**
 * One extracted phrase pair together with its word alignment.
 *
 * `english` and `foreign` hold the phrase IDs assigned by create(); they stay
 * at -1 until create() has seen at least one in-bounds alignment point, so
 * callers never read an indeterminate value.
 * `alignedToE[e]` lists the foreign positions aligned to English position e,
 * and `alignedToF[f]` lists the English positions aligned to foreign
 * position f.
 */
class PhraseAlignment
{
public:
  int english, foreign;
  std::vector< std::vector<std::size_t> > alignedToE;
  std::vector< std::vector<std::size_t> > alignedToF;

  /// Starts with no alignment points and both phrase IDs unassigned (-1).
  PhraseAlignment() : english(-1), foreign(-1) {}

  /// Parses one extract line; the int argument is the line number used in warnings.
  bool create( const char*, int );
  /// Drops all stored alignment points (the phrase IDs are left untouched).
  void clear();
  /// Compares phrase IDs and alignment points with another phrase pair.
  bool equals( const PhraseAlignment& );
};
00036
00037 class LexicalTable
00038 {
00039 public:
00040 map< WORD_ID, map< WORD_ID, double > > ltable;
00041 void load( const string &);
00042 };
00043
00044 }
00045
00046 void processPhrasePairs( vector< PhraseAlignment > & );
00047
00048 ofstream phraseTableFile;
00049
00050 Vocabulary vcbE;
00051 Vocabulary vcbF;
00052 LexicalTable lexTable;
00053 PhraseTable phraseTableE;
00054 PhraseTable phraseTableF;
00055 bool inverseFlag;
00056 int phrasePairBase = 0;
00057
00058 int main(int argc, char* argv[])
00059 {
00060 cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n"
00061 << "modifying PhraseScore v1.4 written by Philipp Koehn\n"
00062 << "It computes statistics for extracted phrase pairs\n"
00063 << "if (direct):\n"
00064 << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
00065 << "if (inverse)\n"
00066 << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";
00067
00068 if (argc != 4 && argc != 5) {
00069 cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
00070 exit(1);
00071 }
00072 char* &fileNameExtract = argv[1];
00073 char* &fileNameLex = argv[2];
00074 char* &fileNamePhraseTable = argv[3];
00075 inverseFlag = false;
00076 if (argc > 4) {
00077 inverseFlag = true;
00078 cerr << "using inverse mode\n";
00079 }
00080
00081
00082 lexTable.load( fileNameLex );
00083
00084
00085 Moses::InputFileStream extractFile(fileNameExtract);
00086
00087 if (extractFile.fail()) {
00088 cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
00089 exit(1);
00090 }
00091 istream &extractFileP = extractFile;
00092
00093
00094 phraseTableFile.open(fileNamePhraseTable);
00095 if (phraseTableFile.fail()) {
00096 cerr << "ERROR: could not open file phrase table file "
00097 << fileNamePhraseTable << endl;
00098 exit(1);
00099 }
00100
00101
00102 int lastForeign = -1;
00103 vector< PhraseAlignment > phrasePairsWithSameF;
00104 int i=0;
00105
00106 string line;
00107 while(getline(extractFileP, line)) {
00108 if (extractFileP.eof()) break;
00109 if (++i % 100000 == 0) cerr << "." << flush;
00110
00111 PhraseAlignment phrasePair;
00112 bool isPhrasePair = phrasePair.create( line.c_str(), i );
00113 if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
00114 processPhrasePairs( phrasePairsWithSameF );
00115 for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
00116 phrasePairsWithSameF[j].clear();
00117 phrasePairsWithSameF.clear();
00118 phraseTableE.clear();
00119 phraseTableF.clear();
00120 phrasePair.clear();
00121 phrasePair.create( line.c_str(), i );
00122 phrasePairBase = 0;
00123 }
00124 lastForeign = phrasePair.foreign;
00125 if (isPhrasePair)
00126 phrasePairsWithSameF.push_back( phrasePair );
00127 else
00128 phrasePairBase++;
00129 }
00130 processPhrasePairs( phrasePairsWithSameF );
00131 phraseTableFile.close();
00132 }
00133
00134 void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
00135 {
00136 if (phrasePair.size() == 0) return;
00137 map<int, int> countE;
00138 map<int, int> alignmentE;
00139 int totalCount = 0;
00140 int currentCount = 0;
00141 int maxSameCount = 0;
00142 int maxSame = -1;
00143 int old = -1;
00144 for(size_t i=0; i<phrasePair.size(); i++) {
00145 if (i>0) {
00146 if (phrasePair[old].english == phrasePair[i].english) {
00147 if (! phrasePair[i].equals( phrasePair[old] )) {
00148 if (currentCount > maxSameCount) {
00149 maxSameCount = currentCount;
00150 maxSame = i-1;
00151 }
00152 currentCount = 0;
00153 }
00154 } else {
00155
00156 if (currentCount > maxSameCount) {
00157 maxSameCount = currentCount;
00158 maxSame = i-1;
00159 }
00160
00161 alignmentE[ phrasePair[old].english ] = maxSame;
00162
00163
00164
00165
00166 totalCount = 0;
00167 currentCount = 0;
00168 maxSameCount = 0;
00169 maxSame = -1;
00170 }
00171 }
00172 countE[ phrasePair[i].english ]++;
00173 old = i;
00174 currentCount++;
00175 totalCount++;
00176 }
00177
00178
00179 if (currentCount > maxSameCount) {
00180 maxSameCount = currentCount;
00181 maxSame = phrasePair.size()-1;
00182 }
00183 alignmentE[ phrasePair[old].english ] = maxSame;
00184
00185
00186
00187
00188 typedef map< int, int >::iterator II;
00189 PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
00190 size_t index = 0;
00191 for(II i = countE.begin(); i != countE.end(); i++) {
00192
00193
00194
00195
00196 if (! inverseFlag) {
00197 for(size_t j=0; j<phraseF.size(); j++) {
00198 phraseTableFile << vcbF.getWord( phraseF[j] );
00199 phraseTableFile << " ";
00200 }
00201 phraseTableFile << "||| ";
00202 }
00203
00204
00205 PHRASE phraseE = phraseTableE.getPhrase( i->first );
00206 for(size_t j=0; j<phraseE.size(); j++) {
00207 phraseTableFile << vcbE.getWord( phraseE[j] );
00208 phraseTableFile << " ";
00209 }
00210 phraseTableFile << "||| ";
00211
00212
00213 if (inverseFlag) {
00214 for(size_t j=0; j<phraseF.size(); j++) {
00215 phraseTableFile << vcbF.getWord( phraseF[j] );
00216 phraseTableFile << " ";
00217 }
00218 phraseTableFile << "||| ";
00219 }
00220
00221
00222 phraseTableFile << i->second;
00223
00224
00225 phraseTableFile << " " << phrasePair.size();
00226
00227
00228 phraseTableFile << " " << phraseF.size();
00229
00230
00231 phraseTableFile << " " << phraseE.size();
00232
00233 phraseTableFile << endl;
00234
00235 index += i->second;
00236 }
00237 }
00238
00239 bool PhraseAlignment::create(const char line[], int lineID )
00240 {
00241 const vector< string > token = util::tokenize( line );
00242 int item = 1;
00243 PHRASE phraseF, phraseE;
00244 for (size_t j=0; j<token.size(); j++) {
00245 if (token[j] == "|||") item++;
00246 else {
00247 if (item == 1)
00248 phraseF.push_back( vcbF.storeIfNew( token[j] ) );
00249 else if (item == 2)
00250 phraseE.push_back( vcbE.storeIfNew( token[j] ) );
00251 else if (item == 3) {
00252 int e,f;
00253 sscanf(token[j].c_str(), "%d-%d", &f, &e);
00254 if ((size_t)e >= phraseE.size() || (size_t)f >= phraseF.size()) {
00255 cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n";
00256 } else {
00257 if (alignedToE.size() == 0) {
00258 vector< size_t > dummy;
00259 for(size_t i=0; i<phraseE.size(); i++)
00260 alignedToE.push_back( dummy );
00261 for(size_t i=0; i<phraseF.size(); i++)
00262 alignedToF.push_back( dummy );
00263 foreign = phraseTableF.storeIfNew( phraseF );
00264 english = phraseTableE.storeIfNew( phraseE );
00265 }
00266 alignedToE[e].push_back( f );
00267 alignedToF[f].push_back( e );
00268 }
00269 }
00270 }
00271 }
00272 return (item>2);
00273 }
00274
00275 void PhraseAlignment::clear()
00276 {
00277 for(size_t i=0; i<alignedToE.size(); i++)
00278 alignedToE[i].clear();
00279 for(size_t i=0; i<alignedToF.size(); i++)
00280 alignedToF[i].clear();
00281 alignedToE.clear();
00282 alignedToF.clear();
00283 }
00284
00285 bool PhraseAlignment::equals( const PhraseAlignment& other )
00286 {
00287 if (this == &other) return true;
00288 if (other.english != english) return false;
00289 if (other.foreign != foreign) return false;
00290 PHRASE phraseE = phraseTableE.getPhrase( english );
00291 PHRASE phraseF = phraseTableF.getPhrase( foreign );
00292 for(size_t i=0; i<phraseE.size(); i++) {
00293 if (alignedToE[i].size() != other.alignedToE[i].size()) return false;
00294 for(size_t j=0; j<alignedToE[i].size(); j++) {
00295 if (alignedToE[i][j] != other.alignedToE[i][j]) return false;
00296 }
00297 }
00298 for(size_t i=0; i<phraseF.size(); i++) {
00299 if (alignedToF[i].size() != other.alignedToF[i].size()) return false;
00300 for(size_t j=0; j<alignedToF[i].size(); j++) {
00301 if (alignedToF[i][j] != other.alignedToF[i][j]) return false;
00302 }
00303 }
00304 return true;
00305 }
00306
00307 void LexicalTable::load( const string &filePath )
00308 {
00309 cerr << "Loading lexical translation table from " << filePath;
00310 ifstream inFile;
00311 inFile.open(filePath.c_str());
00312 if (inFile.fail()) {
00313 cerr << " - ERROR: could not open file\n";
00314 exit(1);
00315 }
00316 istream *inFileP = &inFile;
00317
00318 string line;
00319
00320 int i=0;
00321 while(getline(*inFileP, line)) {
00322 i++;
00323 if (i%100000 == 0) cerr << "." << flush;
00324
00325 const vector<string> token = util::tokenize( line );
00326 if (token.size() != 3) {
00327 cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
00328 token.size() << " " << token[0] << " " << line << endl;
00329 continue;
00330 }
00331
00332 double prob = atof( token[2].c_str() );
00333 WORD_ID wordE = vcbE.storeIfNew( token[0] );
00334 WORD_ID wordF = vcbF.storeIfNew( token[1] );
00335 ltable[ wordF ][ wordE ] = prob;
00336 }
00337 cerr << endl;
00338 }