00001 #include "PerScorer.h"
00002
00003 #include <fstream>
00004 #include <stdexcept>
00005
00006 #include "ScoreStats.h"
00007 #include "Util.h"
00008
00009 using namespace std;
00010
00011 namespace MosesTuning
00012 {
00013
00014
00015 PerScorer::PerScorer(const string& config)
00016 : StatisticsBasedScorer("PER",config) {}
00017
00018 PerScorer::~PerScorer() {}
00019
00020 void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
00021 {
00022
00023
00024 if (referenceFiles.size() != 1) {
00025 throw runtime_error("PER only supports a single reference");
00026 }
00027 m_ref_tokens.clear();
00028 m_ref_lengths.clear();
00029 ifstream in(referenceFiles[0].c_str());
00030 if (!in) {
00031 throw runtime_error("Unable to open " + referenceFiles[0]);
00032 }
00033 string line;
00034 int sid = 0;
00035 while (getline(in,line)) {
00036 line = this->preprocessSentence(line);
00037 vector<int> tokens;
00038 TokenizeAndEncode(line, tokens);
00039 m_ref_tokens.push_back(multiset<int>());
00040 for (size_t i = 0; i < tokens.size(); ++i) {
00041 m_ref_tokens.back().insert(tokens[i]);
00042 }
00043 m_ref_lengths.push_back(tokens.size());
00044 if (sid > 0 && sid % 100 == 0) {
00045 TRACE_ERR(".");
00046 }
00047 ++sid;
00048 }
00049 TRACE_ERR(endl);
00050
00051 }
00052
00053 void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
00054 {
00055 if (sid >= m_ref_lengths.size()) {
00056 stringstream msg;
00057 msg << "Sentence id (" << sid << ") not found in reference set";
00058 throw runtime_error(msg.str());
00059 }
00060
00061 string sentence = this->preprocessSentence(text);
00062
00063
00064
00065 vector<int> testtokens;
00066 TokenizeAndEncode(sentence, testtokens);
00067 multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
00068 set<int> testtokens_unique(testtokens.begin(),testtokens.end());
00069 int correct = 0;
00070 for (set<int>::iterator i = testtokens_unique.begin();
00071 i != testtokens_unique.end(); ++i) {
00072 int token = *i;
00073 correct += min(m_ref_tokens[sid].count(token), testtokens_all.count(token));
00074 }
00075
00076 ostringstream stats;
00077 stats << correct << " " << testtokens.size() << " " << m_ref_lengths[sid] << " " ;
00078 string stats_str = stats.str();
00079 entry.set(stats_str);
00080 }
00081
00082 float PerScorer::calculateScore(const vector<ScoreStatsType>& comps) const
00083 {
00084 float denom = comps[2];
00085 float num = comps[0] - max(0.0f,comps[1]-comps[2]);
00086 if (denom == 0) {
00087
00088 return 0.0;
00089 } else {
00090 return num/denom;
00091 }
00092 }
00093
00094 }
00095