00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <cstdlib>
00013 #include <map>
00014 #include <set>
00015 #include <cmath>
00016 #include <fstream>
00017 #include <iostream>
00018 #include <vector>
00019 #include <sstream>
00020
00021
00022 using namespace std;
00023
00024
00025 double initTransitionProb;
00026 double LAMBDA;
00027
00028 double addLogProbs(double A , double B)
00029 {
00030
00031 if (A == B)
00032 return (A + log10(2.0));
00033
00034 if (A > B) {
00035 if (A - B > 6)
00036 return A;
00037 else
00038 return (A + log10(1+pow(10,(B-A))));
00039 }
00040
00041 else {
00042 if (B - A > 6)
00043 return B;
00044 else
00045 return (B + log10(1+pow(10,(A-B))));
00046 }
00047
00048 }
00049
00050
00051 class NodeStructure
00052 {
00053
00054 public:
00055
00056 NodeStructure() {};
00057 NodeStructure(vector <string> & s , vector <string> & t);
00058 double getPosterior() {
00059 return PPR;
00060 }
00061 void computeFwdBckProbs(map <string , double> & gammas, map <string, double> & alignmentCounts);
00062 void computeNonTransliterationProb (map <string , double> & sourceUnigrams , map <string , double> & targetUnigrams);
00063 void print();
00064
00065 vector <string> source;
00066 vector <string> target;
00067 ~NodeStructure() {};
00068
00069 private:
00070
00071 double NTR;
00072 double PPR;
00073 double ALPHA;
00074 double BETA;
00075
00076 void computeGammaForEdges(map < pair <int , int> , double > & parents, map < pair <int , int> , double > & children , map <string, double> & transitionProbs , map <string, double> & alignmentCounts);
00077 double computeFwdProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & parents);
00078 double FwdProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & parents);
00079 double BckProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & chidren);
00080 double computeBckProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & children);
00081 void getIncomingEdges (pair <int , int> & ST , vector < pair < int , int> > & incomingEdges);
00082 void getOutgoingEdges (pair <int , int> & ST , vector < pair < int , int> > & outgoingEdges);
00083 double getTransitionProb(map <string, double> & transitionProbs , pair <int,int> & edge);
00084 void updateAlignmentCount(map <string, double> & transitionProbs, map <string, double> & alignmentCounts , pair <int,int> & edge , double alpha , double beta);
00085 void computePosteriorProb();
00086 double scaleGamma(double g);
00087 void getEdge (pair <int , int> & v1 , pair <int , int> & v2 , pair <int , int> & v3);
00088
00089 };
00090
00091 void NodeStructure :: print()
00092 {
00093
00094 for (int i = 0; i < source.size(); i++)
00095 cout<<source[i];
00096
00097 cout<<"\t";
00098
00099 for (int i = 0; i < target.size(); i++)
00100 cout<<target[i];
00101
00102 cout<<"\t"<<pow(10,PPR)<<endl;
00103
00104 }
00105
00106 NodeStructure :: NodeStructure(vector <string> & s , vector <string> & t)
00107 {
00108 source = s;
00109 target = t;
00110 }
00111
00112
00113 void NodeStructure :: getEdge (pair <int , int> & v1 , pair <int , int> & v2 , pair <int , int> & v3)
00114 {
00115 if (v2.first - v1.first == 0)
00116 v3.first = -1;
00117 else
00118 v3.first = v2.first;
00119
00120 if (v2.second - v1.second == 0)
00121 v3.second = -1;
00122 else
00123 v3.second = v2.second;
00124 }
00125
00126 void NodeStructure :: computeGammaForEdges(map < pair <int , int> , double > & parents, map < pair <int , int> , double > & children , map <string, double> & transitionProbs , map <string, double> & alignmentCounts)
00127 {
00128
00129 vector < pair < int , int> > incomingEdges;
00130 map < pair <int , int> , double > :: iterator cIter;
00131 map < pair <int , int> , double > :: iterator pIter;
00132 pair <int , int> ST = make_pair (-1,-1);
00133 pair <int , int> edge;
00134
00135 children.erase(ST);
00136 double tProb;
00137 double alpha;
00138 double beta;
00139
00140 for (cIter = children.begin(); cIter != children.end(); cIter++) {
00141 ST = cIter->first;
00142
00143 getIncomingEdges (ST , incomingEdges);
00144 beta = cIter->second;
00145
00146 for (int i = 0; i< incomingEdges.size(); i++) {
00147 pIter = parents.find(incomingEdges[i]);
00148
00149 alpha = pIter->second;
00150 getEdge (incomingEdges[i] , ST , edge);
00151
00152 updateAlignmentCount(transitionProbs, alignmentCounts , edge , alpha , beta);
00153 }
00154 }
00155
00156 }
00157
00158 void NodeStructure :: computeNonTransliterationProb (map <string , double> & sourceUnigrams , map <string , double> & targetUnigrams)
00159 {
00160
00161 NTR = 0.0;
00162
00163 for (int i = 0; i < source.size(); i++) {
00164 NTR += sourceUnigrams[source[i]];
00165 }
00166
00167 for (int i = 0; i < target.size(); i++) {
00168
00169 NTR += targetUnigrams[target[i]];
00170 }
00171
00172 }
00173
00174 double NodeStructure :: scaleGamma(double g)
00175 {
00176 double translit = log10 (1 - pow (10, PPR));
00177 return g + translit;
00178 }
00179 void NodeStructure :: computePosteriorProb()
00180 {
00181 double LAMBDA2 = log10(1 - pow(10, LAMBDA));
00182 double transliterate = LAMBDA2 + ALPHA;
00183 double translate = LAMBDA + NTR;
00184 double trans = transliterate - translate;
00185
00186
00187
00188 double prob = 1/(1+ pow(10 , trans));
00189 PPR = log10(prob);
00190
00191
00192 }
00193
00194 void NodeStructure :: computeFwdBckProbs(map <string , double> & gammas , map <string, double> & alignmentCounts)
00195 {
00196 pair <int , int> START = make_pair (source.size()-1 , target.size()-1);
00197 pair <int , int> END = make_pair (-1 , -1);
00198
00199 map < pair <int , int> , double > parents;
00200 parents[make_pair(-1,-1)] = 0.0;
00201 map < pair <int , int> , double > children;
00202 children[make_pair(source.size()-1,target.size()-1)] = 0.0;
00203
00204 ALPHA = computeFwdProbs(START , gammas, parents);
00205 BETA = computeBckProbs(END , gammas, children);
00206
00207 computePosteriorProb();
00208
00209 computeGammaForEdges(parents , children , gammas , alignmentCounts);
00210
00211 }
00212
00213 void NodeStructure :: getIncomingEdges (pair <int , int> & ST , vector < pair < int , int> > & incomingEdges)
00214 {
00215 incomingEdges.clear();
00216
00217 if (ST.first == -1) {
00218 incomingEdges.push_back(make_pair(ST.first , ST.second-1));
00219 } else if (ST.second == -1) {
00220 incomingEdges.push_back(make_pair(ST.first-1 , ST.second));
00221 } else {
00222 incomingEdges.push_back(make_pair(ST.first , ST.second-1));
00223 incomingEdges.push_back(make_pair(ST.first-1 , ST.second));
00224 incomingEdges.push_back(make_pair(ST.first-1 , ST.second-1));
00225 }
00226
00227 }
00228
00229 void NodeStructure :: getOutgoingEdges (pair <int , int> & ST , vector < pair < int , int> > & outgoingEdges)
00230 {
00231
00232 if (ST.first == source.size()-1) {
00233 outgoingEdges.push_back(make_pair(ST.first , ST.second+1));
00234 } else if (ST.second == target.size()-1) {
00235 outgoingEdges.push_back(make_pair(ST.first+1 , ST.second));
00236 } else {
00237 outgoingEdges.push_back(make_pair(ST.first , ST.second+1));
00238 outgoingEdges.push_back(make_pair(ST.first+1 , ST.second));
00239 outgoingEdges.push_back(make_pair(ST.first+1 , ST.second+1));
00240 }
00241
00242 }
00243
00244 void NodeStructure :: updateAlignmentCount(map <string, double> & transitionProbs, map <string, double> & alignmentCounts , pair <int,int> & edge , double alpha , double beta)
00245 {
00246
00247 double tProb;
00248 double tgamma;
00249 double gamma;
00250 map <string , double> :: iterator aCounts;
00251 string query;
00252
00253 if (edge.first == -1)
00254 query = "NULL";
00255 else
00256 query = source[edge.first];
00257
00258 query += "-";
00259
00260 if (edge.second == -1)
00261 query += "NULL";
00262 else
00263 query += target[edge.second];
00264
00265
00266 if (transitionProbs.size() == 0)
00267 tProb = initTransitionProb;
00268 else
00269 tProb = transitionProbs[query];
00270
00271
00272 tgamma = alpha + tProb + beta - ALPHA;
00273 gamma = scaleGamma(tgamma);
00274
00275
00276
00277 aCounts = alignmentCounts.find(query);
00278
00279 if (aCounts == alignmentCounts.end()) {
00280 alignmentCounts[query] = gamma;
00281 } else {
00282 double temp = aCounts->second;
00283 aCounts->second = addLogProbs(temp , gamma);
00284 }
00285
00286 }
00287
00288 double NodeStructure :: getTransitionProb(map <string, double> & transitionProbs , pair <int,int> & edge)
00289 {
00290
00291 if (transitionProbs.size() == 0)
00292 return initTransitionProb;
00293
00294 string query;
00295
00296 if (edge.first == -1)
00297 query = "NULL";
00298 else
00299 query = source[edge.first];
00300
00301 query += "-";
00302
00303 if (edge.second == -1)
00304 query += "NULL";
00305 else
00306 query += target[edge.second];
00307
00308
00309 return transitionProbs[query];
00310 }
00311
00312 double NodeStructure :: FwdProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & parents)
00313 {
00314
00315 double thisAlpha;
00316 double alpha = -2000;
00317 vector < pair < int , int> > incomingEdges;
00318 pair <int , int> edge;
00319
00320
00321 getIncomingEdges (TS , incomingEdges);
00322
00323 for (int k = 0; k < incomingEdges.size(); k++) {
00324 thisAlpha = parents[incomingEdges[k]];
00325 getEdge (incomingEdges[k], TS , edge);
00326 thisAlpha += getTransitionProb(gammas , edge);
00327 double temp = alpha;
00328 alpha = addLogProbs(temp , thisAlpha);
00329
00330 }
00331
00332 return alpha;
00333 }
00334
00335 double NodeStructure :: computeFwdProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & parents)
00336 {
00337
00338 pair <int , int> TS;
00339 double alpha;
00340
00341 for (int i = 0; i < source.size(); i++) {
00342 TS = make_pair (i , -1);
00343 alpha = FwdProb (TS, gammas, parents);
00344 parents[TS] = alpha;
00345 }
00346
00347 for (int i = 0; i < target.size(); i++) {
00348 TS = make_pair (-1 , i);
00349 alpha = FwdProb (TS, gammas, parents);
00350 parents[TS] = alpha;
00351 }
00352
00353 for (int i = 0; i < source.size(); i++) {
00354 for (int j = 0; j < target.size(); j++) {
00355 TS = make_pair (i , j);
00356 alpha = FwdProb (TS, gammas, parents);
00357 parents[TS] = alpha;
00358 }
00359 }
00360
00361 return parents[ST];
00362 }
00363
00364 double NodeStructure :: BckProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & children)
00365 {
00366
00367 double thisBeta;
00368 double beta = -2000;
00369 vector < pair < int , int> > outgoingEdges;
00370 pair <int , int> edge;
00371
00372 getOutgoingEdges (TS , outgoingEdges);
00373
00374 for (int k = 0; k < outgoingEdges.size(); k++) {
00375 thisBeta = children[outgoingEdges[k]];
00376 getEdge (TS , outgoingEdges[k], edge);
00377 thisBeta += getTransitionProb(gammas , edge);
00378 double temp = beta;
00379 beta = addLogProbs(temp , thisBeta);
00380
00381 }
00382
00383 return beta;
00384 }
00385
00386
00387 double NodeStructure :: computeBckProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & children)
00388 {
00389
00390 pair <int , int> TS;
00391 double beta;
00392
00393 for (int i = source.size()-2; i >= -1; i--) {
00394 TS = make_pair (i , target.size()-1);
00395 beta = BckProb (TS, gammas, children);
00396 children[TS] = beta;
00397 }
00398
00399 for (int i = target.size()-2; i >=-1; i--) {
00400 TS = make_pair (source.size()-1 , i);
00401 beta = BckProb (TS, gammas, children);
00402 children[TS] = beta;
00403 }
00404
00405 for (int i = source.size()-2 ; i >= -1 ; i--) {
00406 for (int j = target.size()-2 ; j >= -1; j--) {
00407 TS = make_pair (i , j);
00408 beta = BckProb (TS, gammas, children);
00409 children[TS] = beta;
00410 }
00411 }
00412
00413 return children[ST];
00414 }
00415
00416
00417
00418 void loadInput(const char * fileName, vector <string> & input)
00419 {
00420
00421
00422
00423 ifstream sr (fileName);
00424 string line;
00425
00426 if(sr.is_open()) {
00427 while(getline(sr , line )) {
00428 input.push_back(line);
00429 }
00430
00431 sr.close();
00432 } else {
00433 cout<<"Unable to read "<<fileName<<endl;
00434 exit(1);
00435 }
00436
00437 }
00438
00439 void printGammas(map <string, double> & alignmentCounts)
00440 {
00441 map <string , double> :: iterator aCounts;
00442
00443 for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) {
00444 cout<<aCounts->first<<" "<<aCounts->second<<endl;
00445 }
00446 }
00447
00448
00449 void getWords(string s, vector <string> & currInput)
00450 {
00451
00452
00453
00454 istringstream iss(s);
00455 currInput.clear();
00456 do {
00457 string sub;
00458 iss >> sub;
00459 currInput.push_back(sub);
00460
00461 } while (iss);
00462
00463 currInput.pop_back();
00464 }
00465
00466 double getInitTransitionProb(int sourceToken, int targetToken)
00467 {
00468 double prod = sourceToken * targetToken;
00469 return log10(1/prod);
00470 }
00471
00472 void runIteration(map <int , NodeStructure> & graph , map <string , double> & gammas , int size)
00473 {
00474
00475 map <string, double> alignmentCounts;
00476 map <int , NodeStructure> :: iterator i;
00477 map <string , double> :: iterator aCounts;
00478 double sum = -2000.0;
00479 double tPPR = -2000.0;
00480
00481 for (i = graph.begin(); i != graph.end(); i++) {
00482
00483 i->second.computeFwdBckProbs(gammas , alignmentCounts);
00484 double temp = tPPR;
00485
00486 tPPR = addLogProbs(graph[i->first].getPosterior() , temp);
00487
00488 }
00489
00490 for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) {
00491 double temp = sum;
00492 sum = addLogProbs(aCounts->second, temp);
00493 }
00494
00495
00496 for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) {
00497 aCounts->second = aCounts->second - sum;
00498 }
00499
00500 gammas.clear();
00501 gammas = alignmentCounts;
00502
00503 LAMBDA = tPPR - log10(size);
00504 }
00505
00506
00507 void setNTRProbabilities(map <int , NodeStructure> & graph , map <string , double> & sourceTypes , map <string , double > & targetTypes, double sourceTokens, double targetTokens)
00508 {
00509
00510 map <string , double> :: iterator i;
00511 map <int , NodeStructure> :: iterator j;
00512
00513
00514 for (i = sourceTypes.begin(); i!= sourceTypes.end(); i++) {
00515 i->second = log10(i->second/sourceTokens);
00516 }
00517
00518 for (i = targetTypes.begin(); i!= targetTypes.end(); i++) {
00519 i->second = log10(i->second/targetTokens);
00520 }
00521
00522
00523 for (j = graph.begin(); j != graph.end(); j++) {
00524 j->second.computeNonTransliterationProb(sourceTypes , targetTypes);
00525 }
00526
00527 }
00528
00529 void printPosterior(map <int , NodeStructure> & graph)
00530 {
00531
00532 map <int , NodeStructure> :: iterator i;
00533
00534 for (i = graph.begin(); i != graph.end(); i++)
00535 graph[i->first].print();
00536 }
00537
00538
00539 int main(int argc, char * argv[])
00540 {
00541
00542 vector <string> input;
00543 vector <string> source;
00544 vector <string> target;
00545 map <string , double> sourceTypes;
00546 map <string , double> targetTypes;
00547 set < vector <string> > tgt;
00548 set < vector <string> > src;
00549 double sourceTokens = 0;
00550 double targetTokens = 0;
00551 map <int , NodeStructure> graph;
00552 map <string , double> gammas;
00553
00554 loadInput(argv[1],input);
00555
00556 cerr<<"Constructing Graph "<<endl;
00557
00558 for(int i=0; i<input.size(); i+=2) {
00559
00560
00561
00562
00563
00564 getWords(input[i],source);
00565 getWords(input[i+1],target);
00566
00567 if (src.find(source) == src.end()) {
00568 for (int j = 0; j< source.size(); j++)
00569 sourceTypes[source[j]]++;
00570 src.insert(source);
00571 sourceTokens += source.size();
00572 }
00573
00574 if (tgt.find(target) == tgt.end()) {
00575 for (int j = 0; j< target.size(); j++)
00576 targetTypes[target[j]]++;
00577
00578 tgt.insert(target);
00579 targetTokens += target.size();
00580 }
00581
00582 NodeStructure obj (source,target);
00583 graph[i] = obj;
00584
00585 }
00586
00587 setNTRProbabilities(graph, sourceTypes, targetTypes, sourceTokens, targetTokens);
00588 initTransitionProb = getInitTransitionProb(sourceTypes.size()+1, targetTypes.size()+1);
00589
00590 LAMBDA = log10(0.5);
00591
00592
00593 for (int i = 0; i< 10; i++) {
00594
00595 cerr<<"Computing Probs : iteration "<<i+1<<endl;
00596 runIteration(graph , gammas , input.size()/2);
00597
00598 }
00599
00600 printPosterior(graph);
00601 cerr<<"Finished..."<<endl;
00602
00603 return 0;
00604 }
00605