00001
00002
00003
00004
00005
00006
00007
00008
00009 #include <fstream>
00010 #include <sstream>
00011 #include <cmath>
00012 #include "Permutation.h"
00013 #include "Util.h"
00014
00015 using namespace std;
00016
00017 namespace MosesTuning
00018 {
00019
00020
00021 Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength )
00022 {
00023 if (sourceLength > 0) {
00024 set(alignment, sourceLength);
00025 }
00026 m_targetLength = targetLength;
00027 }
00028
00029 size_t Permutation::getLength() const
00030 {
00031 return int(m_array.size());
00032 }
00033 void Permutation::dump() const
00034 {
00035 int j=0;
00036 for (vector<int>::const_iterator i = m_array.begin(); i !=m_array.end(); i++) {
00037 cout << "(";
00038 cout << j << ":" << *i ;
00039 cout << "), ";
00040 j++;
00041 }
00042 cout << endl;
00043 }
00044
00045
00046
00047
00048
00049
00050
00051
00052 void Permutation::set(const string & alignment,const int sourceLength)
00053 {
00054
00055
00056
00057 if(sourceLength <= 0) {
00058
00059 cerr << "Source sentence length not positive:"<< sourceLength << endl;
00060 exit(0);
00061 }
00062
00063 if (alignment.length() <= 0) {
00064
00065 cerr << "Alignment string empty:"<< alignment << endl;
00066 }
00067
00068
00069 string buf;
00070 stringstream ss(alignment);
00071 vector<string> tokens;
00072 while (ss >> buf)
00073 tokens.push_back(buf);
00074
00075 vector<int> tempPerm(sourceLength, -1);
00076
00077 for (size_t i=0; i<tokens.size(); i++) {
00078 string temp = tokens[i];
00079 int posDelimeter = temp.find("-");
00080 if(posDelimeter == int(string::npos)) {
00081 cerr << "Delimiter not found - :"<< tokens[i] << endl;
00082 exit(1);
00083 }
00084 int sourcePos = atoi((temp.substr(0, posDelimeter)).c_str());
00085 int targetPos = atoi((temp.substr(posDelimeter+1)).c_str());
00086
00087 if (sourcePos > sourceLength) {
00088 cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
00089 cerr << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
00090 exit(1);
00091 }
00092
00093
00094 if (tempPerm[sourcePos] == -1 || tempPerm[sourcePos] > targetPos) {
00095 tempPerm[sourcePos] = targetPos;
00096 }
00097 }
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110 int last=0;
00111 m_array.assign(sourceLength, -1);
00112
00113 multimap<int, int> invMap;
00114 multimap<int, int>::iterator it;
00115
00116 for (size_t i=0; i<tempPerm.size(); i++) {
00117 if (tempPerm[i] == -1) {
00118 tempPerm[i] = last;
00119 } else {
00120 last = tempPerm[i];
00121 }
00122
00123
00124 invMap.insert(pair<int,int>(tempPerm[i],int(i)));
00125 }
00126
00127
00128
00129
00130
00131
00132
00133 int i=0;
00134
00135 for ( it=invMap.begin() ; it != invMap.end(); it++ ) {
00136
00137
00138 m_array[(*it).second] = i;
00139 i++;
00140 }
00141
00142 bool ok = checkValidPermutation(m_array);
00143
00144 if (!ok) {
00145 throw runtime_error(" Created invalid permutation");
00146 }
00147 }
00148
00149
00150 vector<int> Permutation::invert(const vector<int> & inVector)
00151 {
00152 vector<int> outVector(inVector.size());
00153 for (size_t i=0; i<inVector.size(); i++) {
00154 outVector[inVector[i]] = int(i);
00155 }
00156 return outVector;
00157 }
00158
00159
00160
00161 bool Permutation::checkValidPermutation(vector<int> const & inVector)
00162 {
00163 vector<int> test(inVector.size(),-1);
00164 for (size_t i=0; i< inVector.size(); i++) {
00165
00166 if (test[inVector[i]] > -1) {
00167 cerr << "Permutation error: multiple entries of same value\n" << endl;
00168 return false;
00169 }
00170 test[inVector[i]] ++;
00171 }
00172 for (size_t i=0; i<inVector.size(); i++) {
00173
00174 if (test[inVector[i]] == -1) {
00175 cerr << "Permutation error: missing values\n" << endl;
00176 return false;
00177 }
00178 }
00179 return true;
00180 }
00181
00182
00183
00184
00185 float Permutation::distance(const Permutation &permCompare, const distanceMetric_t &type) const
00186 {
00187 float score=0;
00188
00189
00190 bool debug=false;
00191 if (debug) {
00192 cout << "*****Permutation::distance" <<endl;
00193 cout << "Hypo:" << endl;
00194 dump();
00195 cout << "Ref: " << endl;
00196 permCompare.dump();
00197 }
00198
00199 if (type == HAMMING_DISTANCE) {
00200 score = calculateHamming(permCompare);
00201 } else if (type == KENDALL_DISTANCE) {
00202 score = calculateKendall(permCompare);
00203 } else {
00204 throw runtime_error("Distance type not valid");
00205 }
00206
00207 float brevityPenalty = 1.0 - (float) permCompare.getTargetLength()/getTargetLength() ;
00208 if (brevityPenalty < 0.0) {
00209 score = score * exp(brevityPenalty);
00210 }
00211
00212 if (debug) {
00213 cout << "Distance type:" << type << endl;
00214 cout << "Score: "<< score << endl;
00215 }
00216 return score;
00217 }
00218
00219
00220 float Permutation::calculateHamming(const Permutation & compare) const
00221 {
00222 float score=0;
00223 vector<int> compareArray = compare.getArray();
00224 if (getLength() != compare.getLength()) {
00225 cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
00226 throw runtime_error("Length of permutations not equal");
00227 }
00228 if (getLength() == 0) {
00229 cerr << "Empty permutation" << endl;
00230 return 0;
00231 }
00232 for (size_t i=0; i<getLength(); i++) {
00233 if (m_array[i] != compareArray[i]) {
00234 score++;
00235 }
00236
00237 }
00238 score = 1 - (score / getLength());
00239 return score;
00240 }
00241
00242 float Permutation::calculateKendall(const Permutation & compare) const
00243 {
00244 float score=0;
00245 vector<int> compareArray = compare.getArray();
00246 if (getLength() != compare.getLength()) {
00247 cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
00248 throw runtime_error("Length of permutations not equal");
00249 }
00250 if (getLength() == 0) {
00251 cerr << "Empty permutation" << endl;
00252 return 0;
00253 }
00254 if (getLength() == 1) {
00255 cerr << "One-word sentence. Kendall score = 1" << endl;
00256 return 1;
00257 }
00258 for (size_t i=0; i<getLength(); i++) {
00259 for (size_t j=0; j<getLength(); j++) {
00260 if ((m_array[i] < m_array[j]) && (compareArray[i] > compareArray[j])) {
00261 score++;
00262 }
00263 }
00264 }
00265 score = (score / ((getLength()*getLength() - getLength()) /2 ) );
00266
00267 score = sqrt (score);
00268 score = 1 - score;
00269
00270 return score;
00271 }
00272
00273 vector<int> Permutation::getArray() const
00274 {
00275 vector<int> ret = m_array;
00276 return ret;
00277 }
00278
00279
00280
00281
00282
00283 string Permutation::convertMosesToStandard(string const & alignment)
00284 {
00285 if (alignment.length() == 0) {
00286 cerr << "Alignment input string empty" << endl;
00287 }
00288 string working = alignment;
00289 string out;
00290
00291 stringstream oss;
00292 while (working.length() > 0) {
00293 string align;
00294 getNextPound(working,align," ");
00295
00296
00297 if (align.length() > 0) {
00298 size_t posDelimeter = align.find("=");
00299 if(posDelimeter== string::npos) {
00300 cerr << "Delimiter not found = :"<< align << endl;
00301 exit(0);
00302 }
00303 int firstSourcePos,lastSourcePos,firstTargetPos,lastTargetPos;
00304 string sourcePoss = align.substr(0, posDelimeter);
00305 string targetPoss = align.substr(posDelimeter+1);
00306 posDelimeter = sourcePoss.find("-");
00307 if(posDelimeter < string::npos) {
00308 firstSourcePos = atoi((sourcePoss.substr(0, posDelimeter)).c_str());
00309 lastSourcePos = atoi((sourcePoss.substr(posDelimeter+1)).c_str());
00310 } else {
00311 firstSourcePos = atoi(sourcePoss.c_str());
00312 lastSourcePos = firstSourcePos;
00313 }
00314 posDelimeter = targetPoss.find("-");
00315 if(posDelimeter < string::npos) {
00316 firstTargetPos = atoi((targetPoss.substr(0, posDelimeter)).c_str());
00317 lastTargetPos = atoi((targetPoss.substr(posDelimeter+1)).c_str());
00318 } else {
00319 firstTargetPos = atoi(targetPoss.c_str());
00320 lastTargetPos = firstTargetPos;
00321 }
00322 for (int i = firstSourcePos; i <= lastSourcePos; i++) {
00323 for (int j = firstTargetPos; j <= lastTargetPos; j++) {
00324 oss << i << "-" << j << " ";
00325 }
00326 }
00327
00328 }
00329 }
00330 out = oss.str();
00331
00332
00333 return out;
00334 }
00335
00336 }
00337