00001 #include <fstream>
00002 #include <iostream>
00003 #include <vector>
00004 #include <sstream>
00005 #include <map>
00006 #include <set>
00007 #include <cstdlib>
00008
00009
00010 using namespace std;
00011
00012
/**
 * Parses the integer at the start of a string.
 *
 * @param s text to parse; leading whitespace is skipped by the stream.
 * @return the extracted value, or 0 when the text is empty or contains
 *         only whitespace.
 */
int stringToInteger(std::string s)
{
    // Start from a defined value: when extraction cannot even begin (empty or
    // all-whitespace input) operator>> leaves its target untouched, so an
    // uninitialised local would be returned with an indeterminate value.
    int value = 0;

    std::istringstream buffer(s);
    buffer >> value;
    return value;
}
00021
/**
 * Appends every line of a text file to a vector.
 *
 * @param fileName path of the file to read.
 * @param input    receives one element per line, appended after whatever
 *                 the vector already holds.
 *
 * When the file cannot be opened a message is printed to standard output
 * and the process terminates with exit status 1.
 */
void loadInput(const char * fileName, std::vector <std::string> & input)
{
    std::ifstream reader(fileName);

    if (!reader.is_open()) {
        std::cout << "Unable to read " << fileName << std::endl;
        std::exit(1);
    }

    for (std::string text; std::getline(reader, text); ) {
        input.push_back(text);
    }

    reader.close();
}
00040
/**
 * Splits a string into its whitespace-separated tokens.
 *
 * @param s         text to tokenise.
 * @param currInput cleared first, then filled with the tokens in order;
 *                  left empty when s holds no token.
 */
void getWords(std::string s, std::vector <std::string> & currInput)
{
    currInput.clear();

    std::istringstream tokens(s);
    std::string word;
    while (tokens >> word) {
        currInput.push_back(word);
    }
}
00054
/**
 * Builds the translation operation string for one cept.
 *
 * The words of currF selected by the cept's index list are joined with
 * "^_^". If the joined text is a key of `singletons` the generic
 * "_TRANS_SLF_ " operation is returned; otherwise the result is
 * "_TRANS_<cept text>_TO_<joined text> ".
 *
 * @param index      position of the cept inside gCepts.
 * @param gCepts     cepts as (text, indices into currF) pairs.
 * @param currF      words the index lists refer to.
 * @param singletons lookup table; only key presence is consulted.
 * @return the operation string, always ending in a single space.
 */
std::string getTranslation(int index, std::vector < std::pair <std::string , std::vector <int> > > & gCepts , std::vector <std::string> & currF , std::map <std::string,int> & singletons)
{
    const std::vector <int> & positions = gCepts[index].second;

    std::string joined;
    for (std::size_t n = 0; n < positions.size(); ++n) {
        if (n > 0) {
            joined += "^_^";
        }
        joined += currF[positions[n]];
    }

    if (singletons.count(joined) != 0) {
        return "_TRANS_SLF_ ";
    }

    return "_TRANS_" + gCepts[index].first + "_TO_" + joined + " ";
}
00079
00080
00081
/**
 * Finds the open gap to jump back to for position j1.
 *
 * Gaps are visited from the highest position downwards and only entries
 * whose state is "Unfilled" count. The first such entry at or below j1 is
 * the answer: an entry exactly at j1 wins, otherwise the nearest one below
 * it is chosen.
 *
 * @param gap gap position -> state ("Unfilled" marks an open gap).
 * @param j1  position that has to be reached.
 * @param gp  set to the 1-based rank of the chosen gap among the open gaps,
 *            counted from the highest position; 0 when nothing is found.
 * @return the position of the chosen gap, or -1 when no open gap lies at or
 *         below j1 (including when the map is empty).
 */
int closestGap(const std::map <int,std::string> & gap, int j1, int & gp)
{
    gp = 0;
    int openGaps = 0;

    // Reverse iteration copes with an empty map and needs no distance
    // bookkeeping: keys come in descending order, so the first open gap that
    // is not above j1 is necessarily the closest one, however far away it is.
    for (std::map <int,std::string>::const_reverse_iterator it = gap.rbegin();
         it != gap.rend(); ++it) {
        if (it->second != "Unfilled") {
            continue;
        }

        ++openGaps;

        if (it->first <= j1) {
            gp = openGaps;
            return it->first;
        }
    }

    return -1;
}
00125
00126
00127 void generateStory(vector <pair <string , vector <int> > > & gCepts, set <int> & targetNullWords, vector<string> & currF, map <string,int> & singletons)
00128 {
00129
00130 int fl = 0;
00131 int i = 0;
00132 int j = 0;
00133 int N = gCepts.size();
00134 int k = 0;
00135 int E = 0;
00136 int j1 = 0;
00137 int Li =0;
00138 int Lj=0;
00139 map <int,int > generated;
00140 map <int,string> gap;
00141 map <int,int> :: iterator iter;
00142 int gp=0;
00143
00144
00145 while (targetNullWords.find(j) != targetNullWords.end()) {
00146 cout<<"_INS_"<<currF[j]<<" ";
00147 generated[j]=-1;
00148 j=j+1;
00149 }
00150
00151 while (i < gCepts.size() && gCepts[i].second.size() == 0) {
00152 cout<<"_DEL_"<<gCepts[i].first<<" ";
00153 i=i+1;
00154 }
00155
00156 E=j;
00157
00158 while (i<N) {
00159
00160
00161
00162
00163 Li = gCepts[i].second.size();
00164 j1 = gCepts[i].second[k];
00165
00166
00167
00168 if(j<j1) {
00169 iter = generated.find(j);
00170 if( iter == generated.end()) {
00171 cout<<"_INS_GAP_ ";
00172 gap[j] = "Unfilled";
00173 }
00174
00175 if (j==E) {
00176 j=j1;
00177 } else {
00178 cout<<"_JMP_FWD_ ";
00179 j=E;
00180 }
00181
00182 }
00183
00184 if(j1<j) {
00185 iter = generated.find(j);
00186 if(j<E && iter == generated.end()) {
00187
00188 cout<<"_INS_GAP_ ";
00189 gap[j]="Unfilled";
00190 }
00191
00192 j=closestGap(gap,j1,gp);
00193
00194 cout<<"_JMP_BCK_"<<gp<<" ";
00195
00196 if(j==j1)
00197 gap[j]="Filled";
00198
00199 }
00200
00201 if(j<j1) {
00202 cout<<"_INS_GAP_ ";
00203 gap[j] = "Unfilled";
00204 j=j1;
00205 }
00206
00207 if(k==0) {
00208 cout<<getTranslation(i, gCepts,currF,singletons);
00209 } else {
00210 cout<<"_CONT_CEPT_ ";
00211 }
00212 generated[j]=i;
00213 j=j+1;
00214 k=k+1;
00215
00216 while(targetNullWords.find(j) != targetNullWords.end()) {
00217
00218 cout<<"_INS_"<<currF[j]<<" ";
00219 generated[j]=-1;
00220 j=j+1;
00221 }
00222
00223 if(E<j)
00224 E=j;
00225
00226 if(k==Li) {
00227 i=i+1;
00228 k=0;
00229
00230 while(i < gCepts.size() && gCepts[i].second.size() == 0) {
00231 cout<<"_DEL_"<<gCepts[i].first<<" ";
00232 i=i+1;
00233
00234 }
00235
00236 }
00237
00238 }
00239
00240 cout<<endl;
00241 }
00242
00243
00244
/**
 * Advances engIndex up to `limit`, skipping indices recorded in engDone and
 * appending a cept with an empty index list for every index found in
 * sourceNullWords.
 */
static void emitUnalignedSourceWords(int & engIndex, int limit, const std::set <int> & engDone, const std::set <int> & sourceNullWords, const std::vector <std::string> & currE, std::vector < std::pair < std::string , std::vector <int> > > & gCepts)
{
    const std::vector <int> noPositions;

    while (engIndex < limit) {
        while (engDone.count(engIndex) != 0) {
            ++engIndex;
        }

        while (sourceNullWords.count(engIndex) != 0) {
            gCepts.push_back(std::make_pair(currE[engIndex], noPositions));
            ++engIndex;
        }
    }
}

/**
 * Converts cepts given as index sets into (text, index list) pairs.
 *
 * For each input cept the words of currE selected by the second set are
 * joined with "^_^" and paired with the elements of the first set in
 * ascending order. Words of currE listed in sourceNullWords are emitted in
 * between, and after the last cept, as pairs with an empty index list.
 *
 * @param ceptsInPhrase   input cepts; `.second` indexes currE, `.first` is
 *                        copied into the output index list.
 * @param gCepts          cleared, then filled with the result.
 * @param sourceNullWords indices of currE that belong to no cept.
 * @param currE           words the `.second` sets refer to.
 */
void ceptsInGenerativeStoryFormat(std::vector < std::pair < std::set <int> , std::set <int> > > & ceptsInPhrase , std::vector < std::pair < std::string , std::vector <int> > > & gCepts , std::set <int> & sourceNullWords, std::vector <std::string> & currE)
{
    gCepts.clear();

    int engIndex = 0;         // next index of currE still to be accounted for
    int prev = 0;             // end of the contiguous run starting the current cept
    std::set <int> engDone;   // indices consumed out of order by an earlier cept

    for (std::size_t c = 0; c < ceptsInPhrase.size(); ++c) {
        const std::set <int> & fSide = ceptsInPhrase[c].first;
        const std::set <int> & eSide = ceptsInPhrase[c].second;

        emitUnalignedSourceWords(engIndex, *eSide.begin(), engDone, sourceNullWords, currE, gCepts);

        std::string english;
        for (std::set <int>::const_iterator it = eSide.begin(); it != eSide.end(); ++it) {
            const int curr = *it;

            if (it == eSide.begin()) {
                prev = curr;
                engIndex = prev + 1;
            } else {
                english += "^_^";

                if (prev == curr - 1) {
                    // Directly follows the run: just move past it.
                    ++prev;
                    ++engIndex;
                } else {
                    // Consumed ahead of time: remember it so it is skipped
                    // when engIndex gets there.
                    engDone.insert(curr);
                }
            }

            english += currE[curr];
        }

        const std::vector <int> germanIndex(fSide.begin(), fSide.end());
        gCepts.push_back(std::make_pair(english, germanIndex));
    }

    emitUnalignedSourceWords(engIndex, static_cast<int>(currE.size()), engDone, sourceNullWords, currE, gCepts);
}
00331
/**
 * Prints one line per cept to standard output in the form
 * "<text> <---> <word> <word> ...", where the words are the entries of
 * currF selected by the cept's index list, each followed by a space.
 *
 * @param gCepts cepts as (text, indices into currF) pairs.
 * @param currF  words the index lists refer to.
 */
void printCepts(std::vector < std::pair < std::string , std::vector <int> > > & gCepts , std::vector <std::string> & currF)
{
    typedef std::vector < std::pair < std::string , std::vector <int> > >::const_iterator CeptIter;
    typedef std::vector <int>::const_iterator PosIter;

    for (CeptIter cept = gCepts.begin(); cept != gCepts.end(); ++cept) {
        std::cout << cept->first << " <---> ";

        for (PosIter pos = cept->second.begin(); pos != cept->second.end(); ++pos) {
            std::cout << currF[*pos] << " ";
        }

        std::cout << std::endl;
    }
}
00354
/**
 * Grows eSide and fSide until they are closed under the two link maps.
 *
 * Every element of eSide contributes its tS entries to fSide, and every
 * element of fSide contributes its sT entries to eSide; this repeats until
 * a pass adds nothing new to eSide.
 *
 * @param eSide in/out: seed indices, extended in place.
 * @param fSide in/out: receives the linked indices.
 * @param tS    links followed from eSide to fSide. Looked up with
 *              operator[], so a missing key gets an empty entry.
 * @param sT    links followed from fSide to eSide; same lookup behaviour.
 */
void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT)
{
    std::size_t sizeBefore;

    do {
        sizeBefore = eSide.size();

        for (std::set <int>::const_iterator e = eSide.begin(); e != eSide.end(); ++e) {
            const std::vector <int> & linked = tS[*e];
            fSide.insert(linked.begin(), linked.end());
        }

        for (std::set <int>::const_iterator f = fSide.begin(); f != fSide.end(); ++f) {
            const std::vector <int> & linked = sT[*f];
            eSide.insert(linked.begin(), linked.end());
        }
    } while (eSide.size() > sizeBefore);
}
00386
00387 void constructCepts(vector < pair < set <int> , set <int> > > & ceptsInPhrase, set <int> & sourceNullWords, set <int> & targetNullWords, vector <string> & alignment, int eSize, int fSize)
00388 {
00389
00390 ceptsInPhrase.clear();
00391 sourceNullWords.clear();
00392 targetNullWords.clear();
00393
00394 vector <int> align;
00395
00396 std::map <int , vector <int> > sT;
00397 std::map <int , vector <int> > tS;
00398 std::set <int> eSide;
00399 std::set <int> fSide;
00400 std::set <int> :: iterator iter;
00401 std :: map <int , vector <int> > :: iterator iter2;
00402 std :: pair < set <int> , set <int> > cept;
00403 int src;
00404 int tgt;
00405 ceptsInPhrase.clear();
00406
00407 for (int j=0; j<alignment.size(); j+=2) {
00408 align.push_back(stringToInteger(alignment[j+1]));
00409 align.push_back(stringToInteger(alignment[j]));
00410 }
00411
00412 for (int i = 0; i < align.size(); i+=2) {
00413 src = align[i];
00414 tgt = align[i+1];
00415 tS[tgt].push_back(src);
00416 sT[src].push_back(tgt);
00417 }
00418
00419 for (int i = 0; i< fSize; i++) {
00420 if (sT.find(i) == sT.end()) {
00421 targetNullWords.insert(i);
00422 }
00423 }
00424
00425 for (int i = 0; i< eSize; i++) {
00426 if (tS.find(i) == tS.end()) {
00427 sourceNullWords.insert(i);
00428 }
00429 }
00430
00431
00432 while (tS.size() != 0 && sT.size() != 0) {
00433
00434 iter2 = tS.begin();
00435
00436 eSide.clear();
00437 fSide.clear();
00438 eSide.insert (iter2->first);
00439
00440 getMeCepts(eSide, fSide, tS , sT);
00441
00442 for (iter = eSide.begin(); iter != eSide.end(); iter++) {
00443 iter2 = tS.find(*iter);
00444 tS.erase(iter2);
00445 }
00446
00447 for (iter = fSide.begin(); iter != fSide.end(); iter++) {
00448 iter2 = sT.find(*iter);
00449 sT.erase(iter2);
00450 }
00451
00452 cept = make_pair (fSide , eSide);
00453 ceptsInPhrase.push_back(cept);
00454 }
00455
00456 }
00457
00458 int main(int argc, char * argv[])
00459 {
00460
00461 vector <string> e;
00462 vector <string> f;
00463 vector <string> a;
00464 vector <string> singletons;
00465 map <string,int> sTons;
00466 vector < pair < set <int> , set <int> > > ceptsInPhrase;
00467 vector < pair < string , vector <int> > > gCepts;
00468
00469 set <int> sourceNullWords;
00470 set <int> targetNullWords;
00471
00472 vector <string> currE;
00473 vector <string> currF;
00474 vector <string> currA;
00475
00476 loadInput(argv[4],singletons);
00477
00478 for(int i=0; i<singletons.size(); i++)
00479 sTons[singletons[i]]=i;
00480
00481 loadInput(argv[1],e);
00482 loadInput(argv[2],f);
00483 loadInput(argv[3],a);
00484
00485
00486 for (int i=0; i<a.size(); i++) {
00487
00488
00489 getWords(e[i],currE);
00490 getWords(f[i],currF);
00491 getWords(a[i],currA);
00492
00493 constructCepts(ceptsInPhrase, sourceNullWords , targetNullWords, currA , currE.size(), currF.size());
00494
00495 ceptsInGenerativeStoryFormat(ceptsInPhrase , gCepts , sourceNullWords, currE);
00496
00497
00498 generateStory(gCepts, targetNullWords ,currF,sTons);
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509 }
00510
00511
00512 return 0;
00513
00514 }