00001 #include "BleuDocScorer.h"
00002
00003 #include <sys/types.h>
00004 #include <algorithm>
00005 #include <cassert>
00006 #include <cmath>
00007 #include <climits>
00008 #include <fstream>
00009 #include <iostream>
00010 #include <stdexcept>
00011
00012 #include "util/exception.hh"
00013 #include "Ngram.h"
00014 #include "Reference.h"
00015 #include "Util.h"
00016 #include "Vocabulary.h"
00017
00018
00019 using namespace std;
00020
// Fallback definition of `uint` for MinGW builds (presumably that toolchain
// does not supply the alias itself -- TODO confirm).
// It maps to `unsigned int` rather than a 16-bit type: this file stores
// container sizes and string offsets in `uint` variables, and a 16-bit index
// would wrap at 65535. `unsigned int` also needs no extra header.
#if defined __MINGW32__
#ifndef uint
#define uint unsigned int
#endif // uint
#endif // if
00026
namespace
{

// Name of the configuration entry that selects the reference-length strategy.
const char KEY_REFLEN[] = "reflen";

// Values of that entry which the scorer constructor recognises.
const char REFLEN_CLOSEST[] = "closest";
const char REFLEN_SHORTEST[] = "shortest";
const char REFLEN_AVERAGE[] = "average";

} // namespace
00037
00038 namespace MosesTuning
00039 {
00040
00041
00042 BleuDocScorer::BleuDocScorer(const string& config)
00043 : BleuScorer("BLEUDOC", config),
00044 m_ref_length_type(CLOSEST)
00045 {
00046 const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
00047 if (reflen == REFLEN_AVERAGE) {
00048 m_ref_length_type = AVERAGE;
00049 } else if (reflen == REFLEN_SHORTEST) {
00050 m_ref_length_type = SHORTEST;
00051 } else if (reflen == REFLEN_CLOSEST) {
00052 m_ref_length_type = CLOSEST;
00053 } else {
00054 throw runtime_error("Unknown reference length strategy: " + reflen);
00055 }
00056 }
00057
00058 BleuDocScorer::~BleuDocScorer() {}
00059
00060
00061 bool BleuDocScorer::OpenReferenceStream(istream* is, size_t file_id)
00062 {
00063 if (is == NULL) return false;
00064
00065 string line;
00066 size_t doc_id = -1;
00067 size_t sid = 0;
00068 while (getline(*is, line)) {
00069
00070 if (line.find("<doc docid") != std::string::npos) {
00071 doc_id++;
00072 m_references.push_back(new ScopedVector<Reference>());
00073 sid = 0;
00074 } else if (line.find("<seg") != std::string::npos) {
00075 int start = line.find_first_of('>') + 1;
00076 std::string trans = line.substr(start, line.find_last_of('<')-start);
00077 trans = preprocessSentence(trans);
00078
00079 if (file_id == 0) {
00080 Reference* ref = new Reference;
00081 m_references[doc_id]->push_back(ref);
00082 }
00083
00084 if (m_references[doc_id]->size() <= sid) {
00085 return false;
00086 }
00087 NgramCounts counts;
00088 size_t length = CountNgrams(trans, counts, kBleuNgramOrder);
00089
00090
00091 for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
00092 const NgramCounts::Key& ngram = ci->first;
00093 const NgramCounts::Value newcount = ci->second;
00094
00095 NgramCounts::Value oldcount = 0;
00096 m_references[doc_id]->get().at(sid)->get_counts()->Lookup(ngram, &oldcount);
00097 if (newcount > oldcount) {
00098 m_references[doc_id]->get().at(sid)->get_counts()->operator[](ngram) = newcount;
00099 }
00100 }
00101
00102
00103 m_references[doc_id]->get().at(sid)->push_back(length);
00104 if (sid > 0 && sid % 100 == 0) {
00105 TRACE_ERR(".");
00106 }
00107 ++sid;
00108 }
00109 }
00110 return true;
00111 }
00112
00113 void BleuDocScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
00114 {
00115 if (sid >= m_references.size()) {
00116 stringstream msg;
00117 msg << "Sentence id (" << sid << ") not found in reference set";
00118 throw runtime_error(msg.str());
00119 }
00120
00121 std::vector<std::string> sentences = splitDoc(text);
00122
00123 vector<ScoreStatsType> totStats(kBleuNgramOrder * 2 + 1);
00124
00125 for (uint i=0; i<sentences.size(); ++i) {
00126
00127 NgramCounts testcounts;
00128
00129 vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
00130 string sentence = preprocessSentence(sentences[i]);
00131 const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder);
00132
00133
00134 for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
00135 testcounts_it != testcounts.end(); ++testcounts_it) {
00136 const NgramCounts::Value guess = testcounts_it->second;
00137 const size_t len = testcounts_it->first.size();
00138 NgramCounts::Value correct = 0;
00139
00140 NgramCounts::Value v = 0;
00141 if (m_references[sid]->get().at(i)->get_counts()->Lookup(testcounts_it->first, &v)) {
00142 correct = min(v, guess);
00143 }
00144 stats[len * 2 - 2] += correct;
00145 stats[len * 2 - 1] += guess;
00146 }
00147
00148 const int reference_len = CalcReferenceLength(sid, i, length);
00149 stats.push_back(reference_len);
00150
00151
00152 std::transform(stats.begin(), stats.end(), totStats.begin(),
00153 totStats.begin(), std::plus<int>());
00154 }
00155 entry.set(totStats);
00156 }
00157
00158 std::vector<std::string> BleuDocScorer::splitDoc(const std::string& text)
00159 {
00160 std::vector<std::string> res;
00161
00162 uint index = 0;
00163 std::string::size_type end;
00164
00165 while ((end = text.find(" \\n ", index)) != std::string::npos) {
00166 res.push_back(text.substr(index,end-index));
00167 index = end + 4;
00168 }
00169 return res;
00170 }
00171
00172 statscore_t BleuDocScorer::calculateScore(const vector<int>& comps) const
00173 {
00174 UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
00175
00176 float logbleu = 0.0;
00177 for (size_t i = 0; i < kBleuNgramOrder; ++i) {
00178 if (comps[2*i] == 0) {
00179 return 0.0;
00180 }
00181 logbleu += log(comps[2*i]) - log(comps[2*i+1]);
00182
00183 }
00184 logbleu /= kBleuNgramOrder;
00185
00186 const float brevity = 1.0 - static_cast<float>(comps[kBleuNgramOrder * 2]) / comps[1];
00187 if (brevity < 0.0) {
00188 logbleu += brevity;
00189 }
00190 return exp(logbleu);
00191 }
00192
00193 int BleuDocScorer::CalcReferenceLength(size_t doc_id, size_t sentence_id, size_t length)
00194 {
00195 switch (m_ref_length_type) {
00196 case AVERAGE:
00197 return m_references[doc_id]->get().at(sentence_id)->CalcAverage();
00198 break;
00199 case CLOSEST:
00200 return m_references[doc_id]->get().at(sentence_id)->CalcClosest(length);
00201 break;
00202 case SHORTEST:
00203 return m_references[doc_id]->get().at(sentence_id)->CalcShortest();
00204 break;
00205 default:
00206 cerr << "unknown reference types." << endl;
00207 exit(1);
00208 }
00209 }
00210
00211 }
00212