00001 #pragma once
00002
00003 #include <cmath>
00004 #include <string>
00005 #include <vector>
00006 #include <set>
00007 #include <map>
00008 #include <queue>
00009 #include <iostream>
00010 #include <fstream>
00011 #include <iterator>
00012 #include <algorithm>
00013 #include <limits>
00014 #include <sstream>
00015 #include <boost/algorithm/string.hpp>
00016
00017
00018
00019 namespace MosesTuning
00020 {
00021
00022 namespace M2
00023 {
00024
00025 typedef std::vector<float> Stats;
00026
00027 typedef std::vector<std::string> Sentence;
00028
00029 std::ostream& operator<<(std::ostream& o, Sentence s);
00030
00031 const std::string ToLower(const std::string& str);
00032
00033 struct Annot {
00034 size_t i;
00035 size_t j;
00036
00037 std::string type;
00038 std::string edit;
00039
00040 size_t annotator;
00041
00042 bool operator<(Annot a) const {
00043 return i < a.i || (i == a.i && j < a.j)
00044 || (i == a.i && j == a.j && annotator < a.annotator)
00045 || (i == a.i && j == a.j && annotator == a.annotator && transform(edit) < transform(a.edit));
00046 }
00047
00048 bool operator==(Annot a) const {
00049 return (!(*this < a) && !(a < *this));
00050 }
00051
00052 static std::string transform(const std::string& e);
00053
00054 static bool lowercase;
00055 };
00056
00057 typedef std::set<Annot> Annots;
00058 typedef std::set<size_t> Users;
00059
00060 struct Unit {
00061 Sentence first;
00062 Annots second;
00063 Users third;
00064 };
00065
00066 typedef std::vector<Unit> M2File;
00067
00068 struct Edit {
00069 Edit(float c = 1.0, size_t ch = 0, size_t unch = 1, std::string e = "")
00070 : cost(c), changed(ch), unchanged(unch), edit(e) {}
00071
00072 float cost;
00073 size_t changed;
00074 size_t unchanged;
00075 std::string edit;
00076 };
00077
00078 Edit operator+(Edit& e1, Edit& e2);
00079
00080 struct Vertex {
00081 Vertex(size_t a = 0, size_t b = 0) : i(a), j(b) {}
00082
00083 bool operator<(const Vertex &v) const {
00084 return i < v.i || (i == v.i && j < v.j);
00085 }
00086
00087 bool operator==(const Vertex &v) const {
00088 return i == v.i && j == v.j;
00089 }
00090
00091 size_t i;
00092 size_t j;
00093 };
00094
00095 struct Edge {
00096 Edge(Vertex vv = Vertex(), Vertex uu = Vertex(), Edit editt = Edit())
00097 : v(vv), u(uu), edit(editt) {}
00098
00099 bool operator<(const Edge &e) const {
00100 return v < e.v || (v == e.v && u < e.u);
00101 }
00102
00103 Vertex v;
00104 Vertex u;
00105 Edit edit;
00106 };
00107
00108 Edge operator+(Edge e1, Edge e2);
00109
00110 typedef std::vector<size_t> Row;
00111 typedef std::vector<Row> Matrix;
00112
00113 struct Info {
00114 Info(Vertex vv = Vertex(), Edit editt = Edit())
00115 : v(vv), edit(editt) {}
00116
00117 bool operator<(const Info &i) const {
00118 return v < i.v;
00119 }
00120
00121 Vertex v;
00122 Edit edit;
00123 };
00124
00125 typedef std::set<Info> Track;
00126 typedef std::vector<Track> TrackRow;
00127 typedef std::vector<TrackRow> TrackMatrix;
00128
00129 typedef std::set<Vertex> Vertices;
00130 typedef std::set<Edge> Edges;
00131
00132 class M2
00133 {
00134 private:
00135 M2File m_m2;
00136
00137 size_t m_max_unchanged;
00138 float m_beta;
00139 bool m_lowercase;
00140 bool m_verbose;
00141
00142 public:
00143 M2() : m_max_unchanged(2), m_beta(0.5), m_lowercase(true), m_verbose(false) { }
00144 M2(size_t max_unchanged, float beta, bool truecase, bool verbose = false)
00145 : m_max_unchanged(max_unchanged), m_beta(beta), m_lowercase(!truecase), m_verbose(verbose) {
00146 if(!m_lowercase) {
00147 Annot::lowercase = false;
00148 }
00149 }
00150
00151 float Beta() {
00152 return m_beta;
00153 }
00154
00155 void ReadM2(const std::string& filename) {
00156 std::ifstream m2file(filename.c_str());
00157 std::string line;
00158
00159 Unit unit;
00160 bool first = true;
00161
00162 while(std::getline(m2file, line)) {
00163 if(line.size() > 2) {
00164 if(line.substr(0, 2) == "S ") {
00165 if(!first) {
00166 if(unit.third.empty())
00167 unit.third.insert(0);
00168 m_m2.push_back(unit);
00169 }
00170 first = false;
00171
00172 unit.first = Sentence();
00173 unit.second = Annots();
00174
00175 std::string sentenceLine = line.substr(2);
00176 boost::split(unit.first, sentenceLine, boost::is_any_of(" "), boost::token_compress_on);
00177 }
00178 if(line.substr(0, 2) == "A ") {
00179 std::string annotLine = line.substr(2);
00180
00181 std::vector<std::string> annot;
00182 boost::iter_split(annot, annotLine, boost::algorithm::first_finder("|||"));
00183
00184 if(annot[1] != "noop") {
00185 Annot a;
00186 std::stringstream rangeStr(annot[0]);
00187 rangeStr >> a.i >> a.j;
00188 a.type = annot[1];
00189 a.edit = annot[2];
00190
00191 std::stringstream annotStr(annot[5]);
00192 annotStr >> a.annotator;
00193
00194 unit.third.insert(a.annotator);
00195 unit.second.insert(a);
00196 } else {
00197 std::stringstream annotStr(annot[5]);
00198 size_t annotator;
00199 annotStr >> annotator;
00200 unit.third.insert(annotator);
00201 }
00202 }
00203 }
00204 }
00205 if(unit.third.empty())
00206 unit.third.insert(0);
00207 m_m2.push_back(unit);
00208 }
00209
00210 size_t LevenshteinMatrix(const Sentence &s1, const Sentence &s2, Matrix &d, TrackMatrix &bt) {
00211 size_t n = s1.size();
00212 size_t m = s2.size();
00213
00214 if (n == 0)
00215 return m;
00216 if (m == 0)
00217 return n;
00218
00219 d.resize(n + 1, Row(m + 1, 0));
00220 bt.resize(n + 1, TrackRow(m + 1));
00221
00222 for(size_t i = 0; i <= n; ++i) {
00223 d[i][0] = i;
00224 if(i > 0)
00225 bt[i][0].insert(Info(Vertex(i - 1, 0), Edit(1, 1, 0, "")));
00226 }
00227 for(size_t j = 0; j <= m; ++j) {
00228 d[0][j] = j;
00229 if(j > 0)
00230 bt[0][j].insert(Info(Vertex(0, j - 1), Edit(1, 1, 0, s2[j - 1])));
00231 }
00232
00233 int cost;
00234 for(size_t i = 1; i <= n; ++i) {
00235 for(size_t j = 1; j <= m; ++j) {
00236 if(Annot::transform(s1[i-1]) == Annot::transform(s2[j-1]))
00237 cost = 0;
00238 else
00239 cost = 2;
00240
00241 size_t left = d[i][j - 1] + 1;
00242 size_t down = d[i - 1][j] + 1;
00243 size_t diag = d[i - 1][j - 1] + cost;
00244
00245 d[i][j] = std::min(left, std::min(down, diag));
00246
00247 if(d[i][j] == left)
00248 bt[i][j].insert(Info(Vertex(i, j - 1), Edit(1, 1, 0, s2[j - 1])));
00249 if(d[i][j] == down)
00250 bt[i][j].insert(Info(Vertex(i - 1, j), Edit(1, 1, 0, "")));
00251 if(d[i][j] == diag)
00252 bt[i][j].insert(Info(Vertex(i - 1, j - 1), cost ? Edit(1, 1, 0, s2[j - 1]) : Edit(1, 0, 1, s2[j - 1]) ));
00253 }
00254 }
00255 return d[n][m];
00256 }
00257
00258
00259 void BuildGraph(const TrackMatrix &bt, Vertices &V, Edges &E) {
00260 Vertex start(bt.size() - 1, bt[0].size() - 1);
00261
00262 std::queue<Vertex> Q;
00263 Q.push(start);
00264 while(!Q.empty()) {
00265 Vertex v = Q.front();
00266 Q.pop();
00267 if(V.count(v) > 0)
00268 continue;
00269 V.insert(v);
00270 for(Track::iterator it = bt[v.i][v.j].begin();
00271 it != bt[v.i][v.j].end(); ++it) {
00272 Edge e(it->v, v, it->edit);
00273 E.insert(e);
00274 if(V.count(e.v) == 0)
00275 Q.push(e.v);
00276 }
00277 }
00278
00279 Edges newE;
00280 do {
00281 newE.clear();
00282 for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
00283 for(Edges::iterator it2 = E.begin(); it2 != E.end(); ++it2) {
00284 if(it1->u == it2->v) {
00285 Edge e = *it1 + *it2;
00286 if(e.edit.changed > 0 &&
00287 e.edit.unchanged <= m_max_unchanged &&
00288 E.count(e) == 0)
00289 newE.insert(e);
00290 }
00291 }
00292 }
00293 E.insert(newE.begin(), newE.end());
00294 } while(newE.size() > 0);
00295 }
00296
00297 void AddWeights(Edges &E, const Unit &u, size_t aid) {
00298 for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) {
00299 if(it1->edit.changed > 0) {
00300 const_cast<float&>(it1->edit.cost) += 0.001;
00301 for(Annots::iterator it2 = u.second.begin(); it2 != u.second.end(); ++it2) {
00302
00303 if(it1->v.i == it2->i && it1->u.i == it2->j
00304 && Annot::transform(it1->edit.edit) == Annot::transform(it2->edit)
00305 && it2->annotator == aid) {
00306 int newWeight = -(m_max_unchanged + 1) * E.size();
00307 const_cast<float&>(it1->edit.cost) = newWeight;
00308 }
00309 }
00310 }
00311 }
00312 }
00313
00314 void BellmanFord(Vertices &V, Edges &E) {
00315 Vertex source(0, 0);
00316 std::map<Vertex, float> distance;
00317 std::map<Vertex, Vertex> predecessor;
00318
00319 for(Vertices::iterator it = V.begin(); it != V.end(); ++it) {
00320 if(*it == source)
00321 distance[*it] = 0;
00322 else {
00323 distance[*it] = std::numeric_limits<float>::infinity();
00324 }
00325 }
00326
00327 for(size_t i = 1; i < V.size(); ++i) {
00328 for(Edges::iterator it = E.begin(); it != E.end(); ++it) {
00329 if(distance[it->v] + it->edit.cost < distance[it->u]) {
00330 distance[it->u] = distance[it->v] + it->edit.cost;
00331 predecessor[it->u] = it->v;
00332 }
00333 }
00334 }
00335
00336 Edges newE;
00337
00338 Vertex v = *V.rbegin();
00339 while(true) {
00340
00341 Edges::iterator it = E.find(Edge(predecessor[v], v));
00342 if(it != E.end()) {
00343 Edge f = *it;
00344
00345 newE.insert(f);
00346
00347 v = predecessor[v];
00348 if(v == source)
00349 break;
00350 } else {
00351 std::cout << "Error" << std::endl;
00352 break;
00353 }
00354 }
00355 E.clear();
00356 E.insert(newE.begin(), newE.end());
00357 }
00358
00359 void AddStats(const std::vector<Edges> &Es, const Unit &u, Stats &stats, size_t line) {
00360
00361 std::map<size_t, Stats> statsPerAnnotator;
00362 for(std::set<size_t>::iterator it = u.third.begin();
00363 it != u.third.end(); ++it) {
00364 statsPerAnnotator[*it] = Stats(4, 0);
00365 }
00366
00367 for(Annots::iterator it = u.second.begin(); it != u.second.end(); it++)
00368 statsPerAnnotator[it->annotator][2]++;
00369
00370 for(std::set<size_t>::iterator ait = u.third.begin();
00371 ait != u.third.end(); ++ait) {
00372 for(Edges::iterator eit = Es[*ait].begin(); eit != Es[*ait].end(); ++eit) {
00373 if(eit->edit.changed > 0) {
00374 statsPerAnnotator[*ait][1]++;
00375 Annot f;
00376 f.i = eit->v.i;
00377 f.j = eit->u.i;
00378 f.annotator = *ait;
00379 f.edit = eit->edit.edit;
00380 for(Annots::iterator fit = u.second.begin(); fit != u.second.end(); fit++) {
00381 if(f == *fit)
00382 statsPerAnnotator[*ait][0]++;
00383 }
00384 }
00385 }
00386 }
00387 size_t bestAnnot = 0;
00388 float bestF = -1;
00389 for(std::set<size_t>::iterator it = u.third.begin();
00390 it != u.third.end(); ++it) {
00391 Stats localStats = stats;
00392 localStats[0] += statsPerAnnotator[*it][0];
00393 localStats[1] += statsPerAnnotator[*it][1];
00394 localStats[2] += statsPerAnnotator[*it][2];
00395 if(m_verbose)
00396 std::cerr << *it << " : " << localStats[0] << " " << localStats[1] << " " << localStats[2] << std::endl;
00397 float f = FScore(localStats);
00398 if(m_verbose)
00399 std::cerr << f << std::endl;
00400 if(f > bestF) {
00401 bestF = f;
00402 bestAnnot = *it;
00403 }
00404 }
00405 if(m_verbose)
00406 std::cerr << ">> Chosen Annotator for line " << line + 1 << " : " << bestAnnot << std::endl;
00407 stats[0] += statsPerAnnotator[bestAnnot][0];
00408 stats[1] += statsPerAnnotator[bestAnnot][1];
00409 stats[2] += statsPerAnnotator[bestAnnot][2];
00410 }
00411
00412 void SufStats(const std::string &sStr, size_t i, Stats &stats) {
00413 std::string temp = sStr;
00414
00415 Sentence s;
00416 boost::split(s, temp, boost::is_any_of(" "), boost::token_compress_on);
00417
00418 Unit &unit = m_m2[i];
00419
00420 Matrix d;
00421 TrackMatrix bt;
00422 size_t distance = LevenshteinMatrix(unit.first, s, d, bt);
00423
00424 std::vector<Vertices> Vs(unit.third.size());
00425 std::vector<Edges> Es(unit.third.size());
00426
00427 if(distance > unit.first.size()) {
00428 std::cerr << "Levenshtein distance is greater than source size." << std::endl;
00429 stats[0] = 0;
00430 stats[1] = distance;
00431 stats[2] = 0;
00432 stats[3] = unit.first.size();
00433 return;
00434 } else if(distance > 0) {
00435 for(size_t j = 0; j < unit.third.size(); j++) {
00436 BuildGraph(bt, Vs[j], Es[j]);
00437 AddWeights(Es[j], unit, j);
00438 BellmanFord(Vs[j], Es[j]);
00439 }
00440 }
00441 AddStats(Es, unit, stats, i);
00442 stats[3] = unit.first.size();
00443 }
00444
00445
00446 float FScore(const Stats& stats) {
00447 float p = 1.0;
00448 if(stats[1] != 0)
00449 p = (float)stats[0] / (float)stats[1];
00450
00451 float r = 1.0;
00452 if(stats[2] != 0)
00453 r = (float)stats[0] / (float)stats[2];
00454
00455 float denom = (m_beta * m_beta * p + r);
00456 float f = 0.0;
00457 if(denom != 0)
00458 f = ((1 + m_beta * m_beta) * p * r) / denom;
00459 return f;
00460 }
00461
00462 void FScore(const Stats& stats, float &p, float &r, float &f) {
00463 p = 1.0;
00464 if(stats[1] != 0)
00465 p = (float)stats[0] / (float)stats[1];
00466
00467 r = 1.0;
00468 if(stats[2] != 0)
00469 r = (float)stats[0] / (float)stats[2];
00470
00471 float denom = (m_beta * m_beta * p + r);
00472 f = 0.0;
00473 if(denom != 0)
00474 f = ((1 + m_beta * m_beta) * p * r) / denom;
00475 }
00476 };
00477
00478 }
00479
00480 }