00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <cmath>
00021 #include <limits>
00022 #include <list>
00023
00024 #include <boost/unordered_set.hpp>
00025
00026 #include "util/file_piece.hh"
00027 #include "util/tokenize_piece.hh"
00028
00029 #include "BleuScorer.h"
00030 #include "ForestRescore.h"
00031
00032 using namespace std;
00033
00034 namespace MosesTuning
00035 {
00036
00037 std::ostream& operator<<(std::ostream& out, const WordVec& wordVec)
00038 {
00039 out << "[";
00040 for (size_t i = 0; i < wordVec.size(); ++i) {
00041 out << wordVec[i]->first;
00042 if (i+1< wordVec.size()) out << " ";
00043 }
00044 out << "]";
00045 return out;
00046 }
00047
00048
00049 void ReferenceSet::Load(const vector<string>& files, Vocab& vocab)
00050 {
00051 for (size_t i = 0; i < files.size(); ++i) {
00052 util::FilePiece fh(files[i].c_str());
00053 size_t sentenceId = 0;
00054 while(true) {
00055 StringPiece line;
00056 try {
00057 line = fh.ReadLine();
00058 } catch (util::EndOfFileException &e) {
00059 break;
00060 }
00061 AddLine(sentenceId, line, vocab);
00062 ++sentenceId;
00063 }
00064 }
00065
00066 }
00067
00068 void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab)
00069 {
00070
00071 NgramCounter ngramCounts;
00072 list<WordVec> openNgrams;
00073 size_t length = 0;
00074
00075 for (util::TokenIter<util::SingleCharacter, true> j(line, util::SingleCharacter(' ')); j; ++j) {
00076 const Vocab::Entry* nextTok = &(vocab.FindOrAdd(*j));
00077 ++length;
00078 openNgrams.push_front(WordVec());
00079 for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
00080 k->push_back(nextTok);
00081 ++ngramCounts[*k];
00082 }
00083 if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
00084 }
00085
00086
00087 for (NgramCounter::const_iterator ni = ngramCounts.begin();
00088 ni != ngramCounts.end(); ++ni) {
00089 size_t count = ni->second;
00090
00091 if (ngramCounts_.size() <= sentenceId) ngramCounts_.resize(sentenceId+1);
00092 NgramMap::iterator totalsIter = ngramCounts_[sentenceId].find(ni->first);
00093 if (totalsIter == ngramCounts_[sentenceId].end()) {
00094 ngramCounts_[sentenceId][ni->first] = pair<size_t,size_t>(count,count);
00095 } else {
00096 ngramCounts_[sentenceId][ni->first].first = max(count, ngramCounts_[sentenceId][ni->first].first);
00097 ngramCounts_[sentenceId][ni->first].second += count;
00098 }
00099 }
00100
00101 if (lengths_.size() <= sentenceId) lengths_.resize(sentenceId+1);
00102
00103 if (!lengths_[sentenceId]) {
00104 lengths_[sentenceId] = length;
00105 } else {
00106 lengths_[sentenceId] = min(length,lengths_[sentenceId]);
00107 }
00108
00109
00110 }
00111
00112 size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool clip) const
00113 {
00114 const NgramMap& ngramCounts = ngramCounts_.at(sentenceId);
00115 NgramMap::const_iterator ngi = ngramCounts.find(ngram);
00116 if (ngi == ngramCounts.end()) return 0;
00117 return clip ? ngi->second.first : ngi->second.second;
00118 }
00119
// Initialise an empty vertex state: no target words accumulated yet.
// NOTE(review): bleuStats is constructed with kBleuNgramOrder entries here,
// but the vectors later assigned to it (e.g. in UpdateState) hold
// kBleuNgramOrder*2+1 entries — the initial size appears unused; confirm.
VertexState::VertexState(): bleuStats(kBleuNgramOrder), targetLength(0) {}
00121
00122 void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStatsType>& bleuStats ) const
00123 {
00124 for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) {
00125
00126 size_t order = ngi->first.size();
00127 size_t count = ngi->second;
00128 bleuStats[(order-1)*2 + 1] += count;
00129 bleuStats[(order-1) * 2] += min(count, references_.NgramMatches(sentenceId_,ngi->first,false));
00130 }
00131 }
00132
00133 size_t HgBleuScorer::GetTargetLength(const Edge& edge) const
00134 {
00135 size_t targetLength = 0;
00136 for (size_t i = 0; i < edge.Words().size(); ++i) {
00137 const Vocab::Entry* word = edge.Words()[i];
00138 if (word) ++targetLength;
00139 }
00140 for (size_t i = 0; i < edge.Children().size(); ++i) {
00141 const VertexState& state = vertexStates_[edge.Children()[i]];
00142 targetLength += state.targetLength;
00143 }
00144 return targetLength;
00145 }
00146
FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vector<FeatureStatsType>& bleuStats)
{
  // Score the application of this edge. The edge's word sequence uses NULL as
  // a placeholder for each child vertex; when one is reached we stream through
  // that child's stored left context (and, if the child is full-width, jump to
  // its right context) so that ngrams spanning the child boundary are counted
  // here while ngrams interior to the child are not counted twice.
  NgramCounter ngramCounts;
  size_t childId = 0;    // next child vertex to expand
  size_t wordId = 0;     // position in the edge's own word sequence
  size_t contextId = 0;  // position within the current child's context
  const VertexState* vertexState = NULL;  // non-NULL while inside a child's context
  bool inLeftContext = false;
  bool inRightContext = false;
  list<WordVec> openNgrams;  // ngrams still being extended, newest first
  const Vocab::Entry* currentWord = NULL;
  while (wordId < edge.Words().size()) {
    currentWord = edge.Words()[wordId];
    if (currentWord != NULL) {
      // Ordinary terminal on the edge itself.
      ++wordId;
    } else {
      if (!inLeftContext && !inRightContext) {
        // First encounter with this child placeholder: enter its left context.
        assert(!vertexState);
        vertexState = &(vertexStates_[edge.Children()[childId]]);
        ++childId;
        if (vertexState->leftContext.size()) {
          inLeftContext = true;
          contextId = 0;
          currentWord = vertexState->leftContext[contextId];
        } else {
          // Child has an empty context; just skip past the placeholder.
          vertexState = NULL;
          ++wordId;
          continue;
        }
      } else {
        // Already streaming a child's context; advance within it.
        ++contextId;
        if (inLeftContext && contextId < vertexState->leftContext.size()) {
          // Still inside the left context.
          currentWord = vertexState->leftContext[contextId];
        } else if (inLeftContext) {
          // Left context exhausted.
          if (vertexState->leftContext.size() == kBleuNgramOrder-1) {
            // Full-width child: clear the open ngrams (its interior ngrams were
            // counted at the child itself) and continue from its right context.
            openNgrams.clear();
            inLeftContext = false;
            inRightContext = true;
            contextId = 0;
            currentWord = vertexState->rightContext[contextId];
          } else {
            // Short child: the whole hypothesis fit in the left context, so
            // there is no separate right context; we are done with this child.
            inLeftContext = false;
            vertexState = NULL;
            ++wordId;
            continue;
          }
        } else {
          // Streaming the right context.
          if (contextId < vertexState->rightContext.size()) {
            currentWord = vertexState->rightContext[contextId];
          } else {
            // Right context exhausted; move past the placeholder.
            inRightContext = false;
            vertexState = NULL;
            ++wordId;
            continue;
          }
        }
      }
    }
    assert(currentWord);
    // Boundary markers (e.g. sentence start/end) never participate in ngrams.
    if (graph_.IsBoundary(currentWord)) continue;
    // Open a fresh ngram at this word and extend every open ngram with it.
    openNgrams.push_front(WordVec());
    openNgrams.front().reserve(kBleuNgramOrder);
    for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
      k->push_back(currentWord);
      // Count only ngrams not already counted at a child: either we are on the
      // edge's own words, or the ngram reaches back beyond the child's left
      // context and therefore crosses the boundary.
      if (!vertexState || (inLeftContext && k->size() > contextId+1)) ++ngramCounts[*k];
    }
    if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
  }

  // Match the newly-created ngrams against the references.
  UpdateMatches(ngramCounts, bleuStats);

  // Add in the stats already accumulated at the children.
  for (size_t i = 0; i < edge.Children().size(); ++i) {
    for (size_t j = 0; j < bleuStats.size(); ++j) {
      bleuStats[j] += vertexStates_[edge.Children()[i]].bleuStats[j];
    }
  }

  // Effective reference length: scale the true reference length by the
  // fraction of the source sentence this derivation covers so far.
  FeatureStatsType sourceLength = head.SourceCovered();
  size_t referenceLength = references_.Length(sentenceId_);
  FeatureStatsType effectiveReferenceLength =
    sourceLength / totalSourceLength_ * referenceLength;

  bleuStats[bleuStats.size()-1] = effectiveReferenceLength;

  // Smoothed sentence-level bleu computed against the background stats.
  FeatureStatsType bleu = sentenceLevelBackgroundBleu(bleuStats, backgroundBleu_);

  return bleu;
}
00252
void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const vector<FeatureStatsType>& bleuStats)
{
  // Record the winning edge's state at this vertex: up to kBleuNgramOrder-1
  // words of left and right context, the target length, and the bleu stats.
  VertexState& vertexState = vertexStates_[vertexId];

  // Build the left context by walking the edge's words from the left,
  // expanding each NULL placeholder into that child's stored left context.
  int wi = 0;                              // index into the edge's word sequence
  const VertexState* childState = NULL;    // non-NULL while inside a child's context
  int contexti = 0;                        // index within the child's context
  int childi = 0;                          // next child vertex to expand
  while (vertexState.leftContext.size() < (kBleuNgramOrder-1)) {
    if ((size_t)wi >= winnerEdge.Words().size()) break;
    const Vocab::Entry* word = winnerEdge.Words()[wi];
    if (word != NULL) {
      vertexState.leftContext.push_back(word);
      ++wi;
    } else {
      if (childState == NULL) {
        // Entering a child placeholder: fetch its state.
        childState = &(vertexStates_[winnerEdge.Children()[childi++]]);
        contexti = 0;
      }
      if ((size_t)contexti < childState->leftContext.size()) {
        vertexState.leftContext.push_back(childState->leftContext[contexti++]);
      } else {
        // Child context exhausted; move past the placeholder.
        childState = NULL;
        ++wi;
      }
    }
  }

  // Build the right context the same way, walking words (and children) from
  // the right; words are collected in reverse and flipped afterwards.
  wi = winnerEdge.Words().size() - 1;
  childState = NULL;
  childi = winnerEdge.Children().size() - 1;
  while (vertexState.rightContext.size() < (kBleuNgramOrder-1)) {
    if (wi < 0) break;
    const Vocab::Entry* word = winnerEdge.Words()[wi];
    if (word != NULL) {
      vertexState.rightContext.push_back(word);
      --wi;
    } else {
      if (childState == NULL) {
        // Entering a child placeholder from the right: start at the end of
        // its right context.
        childState = &(vertexStates_[winnerEdge.Children()[childi--]]);
        contexti = childState->rightContext.size()-1;
      }
      if (contexti >= 0) {
        vertexState.rightContext.push_back(childState->rightContext[contexti--]);
      } else {
        childState = NULL;
        --wi;
      }
    }
  }
  reverse(vertexState.rightContext.begin(), vertexState.rightContext.end());

  // Cache length and stats for use when scoring parent edges.
  vertexState.targetLength = GetTargetLength(winnerEdge);
  vertexState.bleuStats = bleuStats;
}
00317
00318
00319 typedef pair<const Edge*,FeatureStatsType> BackPointer;
00320
00321
00325 static void GetBestHypothesis(size_t vertexId, const Graph& graph, const vector<BackPointer>& bps,
00326 HgHypothesis* bestHypo)
00327 {
00328
00329
00330 if (!bps[vertexId].first) return;
00331 const Edge* prevEdge = bps[vertexId].first;
00332 bestHypo->featureVector += *(prevEdge->Features().get());
00333 size_t childId = 0;
00334 for (size_t i = 0; i < prevEdge->Words().size(); ++i) {
00335 if (prevEdge->Words()[i] != NULL) {
00336 bestHypo->text.push_back(prevEdge->Words()[i]);
00337 } else {
00338 size_t childVertexId = prevEdge->Children()[childId++];
00339 HgHypothesis childHypo;
00340 GetBestHypothesis(childVertexId,graph,bps,&childHypo);
00341 bestHypo->text.insert(bestHypo->text.end(), childHypo.text.begin(), childHypo.text.end());
00342 bestHypo->featureVector += childHypo.featureVector;
00343 }
00344 }
00345 }
00346
void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references , size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo)
{
  // Viterbi search over the hypergraph: for each vertex keep the best incoming
  // edge and its model score, optionally mixing bleuWeight * (approximate
  // sentence bleu) into the selection. Vertices are visited in index order and
  // children's back-pointers are read before their parents', so the vertex
  // numbering is presumably a topological order with the goal vertex last —
  // TODO confirm against the Graph builder.
  BackPointer init((const Edge*) NULL,kMinScore);
  vector<BackPointer> backPointers(graph.VertexSize(),init);
  HgBleuScorer bleuScorer(references, graph, sentenceId, backgroundBleu);
  vector<FeatureStatsType> winnerStats(kBleuNgramOrder*2+1);
  for (size_t vi = 0; vi < graph.VertexSize(); ++vi) {
    FeatureStatsType winnerScore = kMinScore;
    const Vertex& vertex = graph.GetVertex(vi);
    const vector<const Edge*>& incoming = vertex.GetIncoming();
    if (!incoming.size()) {
      // No incoming edges: leave the vertex marked unreachable.
      backPointers[vi].first = NULL;
      backPointers[vi].second = kMinScore;
    } else {
      for (size_t ei = 0; ei < incoming.size(); ++ei) {
        // Model score of the edge plus the best scores of its children,
        // clamped at kMinScore so unreachable children don't underflow.
        FeatureStatsType incomingScore = incoming[ei]->GetScore(weights);
        for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) {
          size_t childId = incoming[ei]->Children()[i];
          incomingScore = max(incomingScore + backPointers[childId].second, kMinScore);
        }
        vector<FeatureStatsType> bleuStats(kBleuNgramOrder*2+1);
        FeatureStatsType totalScore = incomingScore;
        if (bleuWeight) {
          FeatureStatsType bleuScore = bleuScorer.Score(*(incoming[ei]), vertex, bleuStats);
          // NOTE(review): unqualified isnan relies on a global declaration the
          // standard doesn't guarantee when only <cmath> is included;
          // std::isnan is the portable spelling — confirm on all toolchains.
          if (isnan(bleuScore)) {
            cerr << "WARN: bleu score undefined" << endl;
            cerr << "\tVertex id : " << vi << endl;
            cerr << "\tBleu stats : ";
            for (size_t i = 0; i < bleuStats.size(); ++i) {
              cerr << bleuStats[i] << ",";
            }
            cerr << endl;
            bleuScore = 0;
          }
          totalScore += bleuWeight * bleuScore;
        }
        if (totalScore >= winnerScore) {
          // New best (ties go to the later edge).
          winnerScore = totalScore;
          backPointers[vi].first = incoming[ei];
          backPointers[vi].second = incomingScore;  // model score only, bleu excluded
          winnerStats = bleuStats;
        }
      }
      // Publish the winner's contexts and stats so parent edges can score
      // ngrams that cross this vertex's boundary.
      if (backPointers[vi].first) {
        bleuScorer.UpdateState(*(backPointers[vi].first), vi, winnerStats);
      }
    }
  }

  // Extract the best derivation from the final vertex.
  GetBestHypothesis(graph.VertexSize()-1, graph, backPointers, bestHypo);

  // Recompute exact bleu stats for the extracted text, this time with clipped
  // (max-over-references) match counts rather than the search approximation.
  bestHypo->bleuStats.resize(kBleuNgramOrder*2+1);
  NgramCounter counts;
  list<WordVec> openNgrams;
  for (size_t i = 0; i < bestHypo->text.size(); ++i) {
    const Vocab::Entry* entry = bestHypo->text[i];
    if (graph.IsBoundary(entry)) continue;
    openNgrams.push_front(WordVec());
    for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
      k->push_back(entry);
      ++counts[*k];
    }
    if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
  }
  for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) {
    size_t order = ngi->first.size();
    size_t count = ngi->second;
    bestHypo->bleuStats[(order-1)*2 + 1] += count;
    bestHypo->bleuStats[(order-1) * 2] += min(count, references.NgramMatches(sentenceId,ngi->first,true));
  }
  // Final slot holds the true reference length (not the scaled estimate).
  bestHypo->bleuStats[kBleuNgramOrder*2] = references.Length(sentenceId);
}
00443
00444
00445 };