00001 #include "SemposScorer.h"
00002
00003 #include <algorithm>
00004 #include <vector>
00005 #include <stdexcept>
00006 #include <fstream>
00007
00008 #include "Util.h"
00009 #include "SemposOverlapping.h"
00010
00011 using namespace std;
00012
00013 namespace MosesTuning
00014 {
00015
00016
00017 SemposScorer::SemposScorer(const string& config)
00018 : StatisticsBasedScorer("SEMPOS", config),
00019 m_ovr(SemposOverlappingFactory::GetOverlapping(getConfig("overlapping", "cap-micro"),this)),
00020 m_enable_debug(false)
00021 {
00022 const string& debugSwitch = getConfig("debug", "0");
00023 if (debugSwitch == "1") m_enable_debug = true;
00024
00025 m_semposMap.clear();
00026
00027 string weightsfile = getConfig("weightsfile", "");
00028 if (weightsfile != "") {
00029 loadWeights(weightsfile);
00030 }
00031 }
00032
00033 SemposScorer::~SemposScorer() {}
00034
00035 void SemposScorer::setReferenceFiles(const vector<string>& referenceFiles)
00036 {
00037
00038 m_ref_sentences.clear();
00039
00040
00041 for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
00042 ifstream refin(referenceFiles[rid].c_str());
00043 if (!refin) {
00044 throw runtime_error("Unable to open: " + referenceFiles[rid]);
00045 }
00046 m_ref_sentences.push_back(vector<sentence_t>());
00047 string line;
00048 while (getline(refin,line)) {
00049 line = preprocessSentence(line);
00050
00051 str_sentence_t sentence;
00052 splitSentence(line, sentence);
00053
00054 sentence_t encodedSentence;
00055 encodeSentence(sentence, encodedSentence);
00056
00057 m_ref_sentences[rid].push_back(encodedSentence);
00058 }
00059 }
00060 }
00061
00062 void SemposScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
00063 {
00064 vector<ScoreStatsType> stats;
00065
00066 const string& sentence = preprocessSentence(text);
00067 str_sentence_t splitCandSentence;
00068 splitSentence(sentence, splitCandSentence);
00069
00070 sentence_t encodedCandSentence;
00071 encodeSentence(splitCandSentence, encodedCandSentence);
00072
00073 if (m_ref_sentences.size() == 1) {
00074 stats = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[0][sid]);
00075 } else {
00076 float max = -1.0f;
00077 for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
00078 const vector<ScoreStatsType>& tmp = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[rid][sid]);
00079 if (m_ovr->calculateScore(tmp) > max) {
00080 stats = tmp;
00081 }
00082 }
00083 }
00084 entry.set(stats);
00085 }
00086
00087 void SemposScorer::splitSentence(const string& sentence, str_sentence_t& splitSentence)
00088 {
00089 splitSentence.clear();
00090
00091 vector<string> tokens;
00092 split(sentence, ' ', tokens);
00093 for (vector<string>::iterator it = tokens.begin(); it != tokens.end(); ++it) {
00094 vector<string> factors;
00095 if (it->empty()) continue;
00096 split(*it, '|', factors);
00097 if (factors.size() != 2) throw runtime_error("Sempos scorer accepts two factors (item|class)");
00098 const string& item = factors[0];
00099 const string& klass = factors[1];
00100 splitSentence.push_back(make_pair(item, klass));
00101 }
00102 }
00103
00104 void SemposScorer::encodeSentence(const str_sentence_t& sentence, sentence_t& encodedSentence)
00105 {
00106 for (str_sentence_it it = sentence.begin(); it != sentence.end(); ++it) {
00107 const int tlemma = encodeString(it->first);
00108 const int sempos = encodeSempos(it->second);
00109 if (sempos >= 0) {
00110 encodedSentence.insert(make_pair(tlemma,sempos));
00111 }
00112 }
00113 }
00114
00115 int SemposScorer::encodeString(const string& str)
00116 {
00117 encoding_it encoding = m_stringMap.find(str);
00118 int encoded_str;
00119 if (encoding == m_stringMap.end()) {
00120 encoded_str = static_cast<int>(m_stringMap.size());
00121 m_stringMap[str] = encoded_str;
00122 } else {
00123 encoded_str = encoding->second;
00124 }
00125 return encoded_str;
00126 }
00127
00128 int SemposScorer::encodeSempos(const string& sempos)
00129 {
00130 if (sempos == "-") return -1;
00131 encoding_it it = m_semposMap.find(sempos);
00132 if (it == m_semposMap.end()) {
00133 const int classNumber = static_cast<int>(m_semposMap.size());
00134 if (classNumber == kMaxNOC) {
00135 throw std::runtime_error("Number of classes is greater than kMaxNOC");
00136 }
00137 m_semposMap[sempos] = classNumber;
00138 return classNumber;
00139 } else {
00140 return it->second;
00141 }
00142 }
00143
00144 float SemposScorer::weight(int item) const
00145 {
00146 std::map<int,float>::const_iterator it = weightsMap.find(item);
00147 if (it == weightsMap.end()) {
00148 return 1.0f;
00149 } else {
00150 return it->second;
00151 }
00152 }
00153
00154 void SemposScorer::loadWeights(const string& weightsfile)
00155 {
00156 string line;
00157 ifstream myfile;
00158 myfile.open(weightsfile.c_str(), ifstream::in);
00159 if (myfile.is_open()) {
00160 while ( myfile.good() ) {
00161 getline (myfile,line);
00162 vector<string> fields;
00163 if (line == "") continue;
00164 split(line, '\t', fields);
00165 if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");
00166 int encoded = encodeString(fields[0]);
00167 float weight = atof(fields[1].c_str());
00168 weightsMap[encoded] = weight;
00169 }
00170 myfile.close();
00171 } else {
00172 cerr << "Unable to open file "<< weightsfile << endl;
00173 exit(1);
00174 }
00175
00176 }
00177
00178 }
00179