00001 #pragma once
00002
00003 #include <iostream>
00004 #include <sstream>
00005 #include <stdexcept>
00006 #include <string>
00007 #include <vector>
00008 #include <limits>
00009 #include "Types.h"
00010 #include "ScoreData.h"
00011
00012 namespace mert
00013 {
00014
00015 class Vocabulary;
00016
00017 }
00018
00019 namespace MosesTuning
00020 {
00021
00022 class PreProcessFilter;
00023 class ScoreStats;
00024
/**
 * Selects how scores are regularised.
 *
 * The enumerators keep their implicit values (REG_NONE == 0,
 * REG_AVERAGE == 1, REG_MINIMUM == 2).
 *
 * NOTE(review): REG_AVERAGE and REG_MINIMUM presumably correspond to the
 * score_average() and score_min() helpers declared in this header -- confirm
 * against the scorer implementations.
 */
enum ScorerRegularisationStrategy {
  REG_NONE,
  REG_AVERAGE,
  REG_MINIMUM
};
00026
00033 class Scorer
00034 {
00035 public:
00036 Scorer(const std::string& name, const std::string& config);
00037 virtual ~Scorer();
00038
00042 virtual std::size_t NumberOfScores() const = 0;
00043
00047 virtual float calculateScore(const std::vector<ScoreStatsType>& totals) const = 0;
00048
00049 float calculateSentenceLevelBackgroundScore(const std::vector<ScoreStatsType>& totals, const std::vector<ScoreStatsType>& bg) {
00050 std::vector<ScoreStatsType> stats(totals.size());
00051 for(size_t i=0; i<stats.size(); i++)
00052 stats[i] = totals[i]+bg[i];
00053
00054 return calculateScore(stats) * getReferenceLength(stats);
00055 }
00056
00060 virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles) {
00061
00062 }
00063
00068 virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry) {
00069
00070 }
00071
00072 virtual void prepareStats(const std::string& sindex, const std::string& text, ScoreStats& entry) {
00073 this->prepareStats(static_cast<std::size_t>(atoi(sindex.c_str())), text, entry);
00074 }
00075
00080 virtual void score(const candidates_t& candidates, const diffs_t& diffs,
00081 statscores_t& scores) const = 0;
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00099 float score(const candidates_t& candidates) const;
00100
00101 const std::string& getName() const {
00102 return m_name;
00103 }
00104
00105 std::size_t getReferenceSize() const {
00106 if (m_score_data) {
00107 return m_score_data->size();
00108 }
00109 return 0;
00110 }
00111
00115 virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const = 0;
00116
00120 virtual void setScoreData(ScoreData* data) {
00121 m_score_data = data;
00122 }
00123
00128 virtual bool useAlignment() const {
00129
00130 return false;
00131 };
00132
00136 virtual void setFactors(const std::string& factors);
00137
00138 mert::Vocabulary* GetVocab() const {
00139 return m_vocab;
00140 }
00141
00145 virtual void setFilter(const std::string& filterCommand);
00146
00147 private:
00148 void InitConfig(const std::string& config);
00149
00153 std::string applyFactors(const std::string& sentece) const;
00154
00158 std::string applyFilter(const std::string& sentence) const;
00159
00160 std::string m_name;
00161 mert::Vocabulary* m_vocab;
00162 std::map<std::string, std::string> m_config;
00163 std::vector<int> m_factors;
00164
00165 #if defined(__GLIBCXX__) || defined(__GLIBCPP__)
00166 PreProcessFilter* m_filter;
00167 #endif
00168
00169 protected:
00170 ScoreData* m_score_data;
00171 bool m_enable_preserve_case;
00172
00176 std::string getConfig(const std::string& key, const std::string& def="") const {
00177 std::map<std::string,std::string>::const_iterator i = m_config.find(key);
00178 if (i == m_config.end()) {
00179 return def;
00180 } else {
00181 return i->second;
00182 }
00183 }
00184
00189 void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded) const;
00190
00191
00192
00193
00194 void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded) const;
00195
00199 std::string preprocessSentence(const std::string& sentence) const {
00200 return applyFactors(applyFilter(sentence));
00201 }
00202
00203 };
00204
00205 namespace
00206 {
00207
00208
00209 inline float score_min(const statscores_t& scores, size_t start, size_t end)
00210 {
00211 float min = std::numeric_limits<float>::max();
00212 for (size_t i = start; i < end; ++i) {
00213 if (scores[i] < min) {
00214 min = scores[i];
00215 }
00216 }
00217 return min;
00218 }
00219
00220 inline float score_average(const statscores_t& scores, size_t start, size_t end)
00221 {
00222 if ((end - start) < 1) {
00223
00224 return 0;
00225 }
00226 float total = 0;
00227 for (size_t j = start; j < end; ++j) {
00228 total += scores[j];
00229 }
00230
00231 return total / (end - start);
00232 }
00233
00234 }
00235
00236 }
00237