00001 #include <fstream>
00002 #include <iostream>
00003 #include <vector>
00004 #include <sstream>
00005 #include <map>
00006 #include <set>
00007 #include <cstdlib>
00008
00009
00010 using namespace std;
00011
00012
/**
 * Parses the integer at the start of a string.
 *
 * The value is read with a formatted stream extraction, so leading
 * whitespace is skipped and reading stops at the first character that
 * cannot belong to an int.
 *
 * @param s text to parse
 * @return the parsed value, or 0 when s does not begin with an integer
 */
int stringToInteger(std::string s)
{
  std::istringstream buffer(s);

  // Start from a defined value: before C++11 a failed extraction leaves the
  // target untouched, so an uninitialized int would be returned as-is.
  int some_int = 0;
  buffer >> some_int;
  return some_int;
}
00021
/**
 * Reads a text file and appends each of its lines to a vector.
 *
 * Lines are appended in file order without their line terminator; anything
 * already stored in `input` is kept. If the file cannot be opened, a
 * message is written to standard error and the process exits with status 1.
 *
 * @param fileName path of the file to read
 * @param input    vector that receives the lines
 */
void loadInput(const char * fileName, std::vector<std::string> & input)
{
  // A null path can neither be opened nor streamed safely, so report it
  // without touching the pointer.
  if (!fileName) {
    std::cerr << "Unable to read (null)" << std::endl;
    std::exit(1);
  }

  std::ifstream sr(fileName);

  if (!sr.is_open()) {
    // Diagnostics go to stderr so they never mix with results on stdout.
    std::cerr << "Unable to read " << fileName << std::endl;
    std::exit(1);
  }

  std::string line;
  while (std::getline(sr, line)) {
    input.push_back(line);
  }
  // The stream closes itself when it goes out of scope.
}
00040
/**
 * Splits a string into its whitespace-separated tokens.
 *
 * @param s         text to tokenize
 * @param currInput receives the tokens in order of appearance; its previous
 *                  contents are discarded
 */
void getWords(std::string s, std::vector<std::string> & currInput)
{
  currInput.clear();

  std::istringstream tokens(s);
  for (std::string word; tokens >> word; ) {
    currInput.push_back(word);
  }
}
00054
/**
 * Grows eSide and fSide until they hold one complete group of mutually
 * linked positions.
 *
 * Each round adds to fSide every value listed in tS for a member of eSide,
 * then adds to eSide every value listed in sT for a member of fSide. Rounds
 * repeat for as long as eSide keeps growing.
 *
 * Lookups use operator[], so a key that is missing from tS or sT gets
 * inserted there with an empty vector.
 *
 * @param eSide in/out: seed positions on entry, full group on return
 * @param fSide in/out: positions linked to eSide are added to it
 * @param tS    links from eSide positions to fSide positions
 * @param sT    links from fSide positions to eSide positions
 */
void getMeCepts(std::set<int> & eSide, std::set<int> & fSide, std::map<int, std::vector<int> > & tS, std::map<int, std::vector<int> > & sT)
{
  std::size_t sizeBefore;

  do {
    sizeBefore = eSide.size();

    for (std::set<int>::const_iterator e = eSide.begin(); e != eSide.end(); ++e) {
      const std::vector<int> & linked = tS[*e];
      fSide.insert(linked.begin(), linked.end());
    }

    for (std::set<int>::const_iterator f = fSide.begin(); f != fSide.end(); ++f) {
      const std::vector<int> & linked = sT[*f];
      eSide.insert(linked.begin(), linked.end());
    }

    // New eSide members may bring further links, so go around again.
  } while (eSide.size() > sizeBefore);
}
00086
00087 void constructCepts(vector < pair < set <int> , set <int> > > & ceptsInPhrase, set <int> & sourceNullWords, set <int> & targetNullWords, vector <string> & alignment, int eSize, int fSize)
00088 {
00089
00090 ceptsInPhrase.clear();
00091 sourceNullWords.clear();
00092 targetNullWords.clear();
00093
00094 vector <int> align;
00095 vector <string> mAlign;
00096
00097 std::map <int , vector <int> > sT;
00098 std::map <int , vector <int> > tS;
00099 std::set <int> eSide;
00100 std::set <int> fSide;
00101 std::set <int> :: iterator iter;
00102 std :: map <int , vector <int> > :: iterator iter2;
00103 std :: pair < set <int> , set <int> > cept;
00104 int src;
00105 int tgt;
00106 ceptsInPhrase.clear();
00107 int res;
00108
00109 for (int j=0; j<alignment.size(); j+=1) {
00110 res = alignment[j].find("-");
00111 mAlign.push_back(alignment[j].substr(0,res));
00112 mAlign.push_back(alignment[j].substr(res+1));
00113 }
00114
00115 for (int j=0; j<mAlign.size(); j+=2) {
00116 align.push_back(stringToInteger(mAlign[j+1]));
00117 align.push_back(stringToInteger(mAlign[j]));
00118 }
00119
00120 for (int i = 0; i < align.size(); i+=2) {
00121 src = align[i];
00122 tgt = align[i+1];
00123 tS[tgt].push_back(src);
00124 sT[src].push_back(tgt);
00125 }
00126
00127 for (int i = 0; i< fSize; i++) {
00128 if (sT.find(i) == sT.end()) {
00129 targetNullWords.insert(i);
00130 }
00131 }
00132
00133 for (int i = 0; i< eSize; i++) {
00134 if (tS.find(i) == tS.end()) {
00135 sourceNullWords.insert(i);
00136 }
00137 }
00138
00139
00140 while (tS.size() != 0 && sT.size() != 0) {
00141
00142 iter2 = tS.begin();
00143
00144 eSide.clear();
00145 fSide.clear();
00146 eSide.insert (iter2->first);
00147
00148 getMeCepts(eSide, fSide, tS , sT);
00149
00150 for (iter = eSide.begin(); iter != eSide.end(); iter++) {
00151 iter2 = tS.find(*iter);
00152 tS.erase(iter2);
00153 }
00154
00155 for (iter = fSide.begin(); iter != fSide.end(); iter++) {
00156 iter2 = sT.find(*iter);
00157 sT.erase(iter2);
00158 }
00159
00160 cept = make_pair (fSide , eSide);
00161 ceptsInPhrase.push_back(cept);
00162 }
00163
00164 }
00165
/**
 * Collects the word pairs of all one-to-one cepts of a sentence pair.
 *
 * A cept qualifies when both of its position sets hold exactly one element.
 * For each such cept the string
 *   currF[position in .second] + "\t" + currE[position in .first]
 * is added to `one`. Cepts whose position is negative or beyond the end of
 * the corresponding word vector are skipped instead of being read out of
 * range.
 *
 * @param ceptsInPhrase cepts of the sentence pair
 * @param currF         words indexed by the positions in `.second`
 * @param currE         words indexed by the positions in `.first`
 * @param one           set that accumulates the "F<TAB>E" pairs
 */
void getOneToOne(std::vector<std::pair<std::set<int>, std::set<int> > > & ceptsInPhrase, std::vector<std::string> & currF, std::vector<std::string> & currE, std::set<std::string> & one)
{
  for (std::size_t i = 0; i < ceptsInPhrase.size(); ++i) {
    const std::set<int> & ePositions = ceptsInPhrase[i].first;
    const std::set<int> & fPositions = ceptsInPhrase[i].second;

    if (ePositions.size() != 1 || fPositions.size() != 1) {
      continue;
    }

    const int eIdx = *ePositions.begin();
    const int fIdx = *fPositions.begin();

    // An alignment may point past the end of a sentence (malformed or
    // mismatched input); ignore such links.
    if (eIdx < 0 || fIdx < 0) {
      continue;
    }
    if (static_cast<std::size_t>(eIdx) >= currE.size() ||
        static_cast<std::size_t>(fIdx) >= currF.size()) {
      continue;
    }

    // std::set::insert already ignores duplicates, so no prior lookup.
    one.insert(currF[fIdx] + "\t" + currE[eIdx]);
  }
}
00180
/**
 * Writes every entry of the set to standard output, one per line, in the
 * set's iteration order.
 *
 * @param one entries to print
 */
void printOneToOne(std::set<std::string> & one)
{
  for (std::set<std::string>::const_iterator entry = one.begin(); entry != one.end(); ++entry) {
    // '\n' rather than std::endl: the set can be large, and flushing the
    // stream after every single entry is needlessly slow.
    std::cout << *entry << '\n';
  }
}
00189
00190 int main(int argc, char * argv[])
00191 {
00192
00193 vector <string> e;
00194 vector <string> f;
00195 vector <string> a;
00196 vector < pair < set <int> , set <int> > > ceptsInPhrase;
00197 vector < pair < string , vector <int> > > gCepts;
00198
00199 set <int> sourceNullWords;
00200 set <int> targetNullWords;
00201
00202 vector <string> currE;
00203 vector <string> currF;
00204 vector <string> currA;
00205 set <string> one;
00206
00207 loadInput(argv[1],f);
00208 loadInput(argv[2],e);
00209 loadInput(argv[3],a);
00210
00211
00212 for (int i=0; i<a.size(); i++) {
00213
00214
00215 getWords(e[i],currE);
00216 getWords(f[i],currF);
00217 getWords(a[i],currA);
00218
00219 if (i % 100000 == 0) {
00220 cerr<<"Processing "<<i<<endl;
00221 }
00222 constructCepts(ceptsInPhrase, sourceNullWords , targetNullWords, currA , currE.size(), currF.size());
00223 getOneToOne(ceptsInPhrase , currF , currE, one);
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233 }
00234
00235 printOneToOne(one);
00236
00237
00238 return 0;
00239
00240 }