00001
00002 #include <iostream>
00003 #include <fstream>
00004 #include <cassert>
00005 #include <vector>
00006 #include <string>
00007 #include "moses/Util.h"
00008 #include "Alignments.h"
00009
00010 using namespace std;
00011 using namespace Moses;
00012
/**
 * Return a copy of `str` with every leading and trailing character that
 * appears in `dropChars` removed.
 *
 * @param str        text to trim (left untouched)
 * @param dropChars  set of characters to strip from both ends
 * @return the trimmed copy; empty when `str` is empty or consists only of
 *         characters from `dropChars`
 */
inline const std::string TrimInternal(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type firstKept = str.find_first_not_of(dropChars);
  if (firstKept == std::string::npos) {
    // Nothing but droppable characters (or nothing at all).
    return std::string();
  }

  // A kept character exists, so the search from the back cannot fail.
  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
  return str.substr(firstKept, lastKept - firstKept + 1);
}
00019
/**
 * Plain bundle of the strings produced by createXML() for one rule.
 * All members are public and default to the empty string.
 */
class CreateXMLRetValues
{
public:
  std::string frame;             // input frame with <xml translation="..."> spans
  std::string ruleS;             // source side of the rule
  std::string ruleT;             // target side of the rule
  std::string ruleAlignment;     // "s-t" alignment points, space separated
  std::string ruleAlignmentInv;  // the same points written as "t-s"
};
00025
00026 CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path );
00027
00028 void create_xml(const string &inPath)
00029 {
00030 ifstream inStrme(inPath.c_str());
00031 ofstream rule((inPath + ".extract").c_str());
00032 ofstream ruleInv((inPath + ".extract.inv").c_str());
00033
00034
00035
00036 string source, target, align, path;
00037 string *input = NULL;
00038 int count;
00039
00040 int lineCount = 1;
00041 int ruleCount = 1;
00042 string inLine;
00043
00044 int step = 0;
00045 while (!inStrme.eof()) {
00046 getline(inStrme, inLine);
00047
00048 switch (step) {
00049 case 0:
00050
00051 Scan<int>(inLine);
00052 ++step;
00053 break;
00054 case 1:
00055
00056 Scan<float>(inLine);
00057 ++step;
00058 break;
00059 case 2:
00060 source = inLine;
00061 ++step;
00062 break;
00063 case 3:
00064 if (input == NULL) {
00065 input = new string(inLine);
00066 } else {
00067 assert(inLine == *input);
00068 }
00069 ++step;
00070 break;
00071 case 4:
00072 target = inLine;
00073 ++step;
00074 break;
00075 case 5:
00076 align = inLine;
00077 ++step;
00078 break;
00079 case 6:
00080 path = inLine + "X";
00081 ++step;
00082 break;
00083 case 7:
00084 count = Scan<int>(inLine);
00085 CreateXMLRetValues ret = createXML(ruleCount, source, *input, target, align, path);
00086
00087
00088 rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment
00089 << " ||| " << count << endl;
00090 ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv
00091 << " ||| " << count << endl;
00092
00093
00094 ++ruleCount;
00095 step = 0;
00096 break;
00097 }
00098
00099 ++lineCount;
00100 }
00101
00102 delete input;
00103 ruleInv.close();
00104 rule.close();
00105 inStrme.close();
00106
00107 }
00108
00109
00110 CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path)
00111 {
00112 CreateXMLRetValues ret;
00113 vector<string> sourceToks = Tokenize(source, " ")
00114 ,inputToks = Tokenize(input, " ")
00115 ,targetsToks = Tokenize(target, " ");
00116 Alignments alignments(align, sourceToks.size(), targetsToks.size());
00117 map<int, string> frameInput;
00118 map<int, int> alignI2S;
00119 vector< map<string, int> > nonTerms;
00120 vector<bool> targetBitmap(targetsToks.size(), true);
00121 vector<bool> inputBitmap;
00122
00123
00124 int s = 0, i = 0;
00125 bool currently_matching = false;
00126 int start_s = 0, start_i = 0;
00127
00128
00129 for ( int p = 0 ; p < int(path.length()) ; p++ ) {
00130 string action = path.substr(p, 1);
00131
00132
00133 if ( currently_matching && action != "M" && action != "X" ) {
00134 start_i = i;
00135 start_s = s;
00136 currently_matching = 0;
00137 }
00138
00139 else if ( !currently_matching && ( action == "M" || action == "X" ) ) {
00140
00141
00142 for ( int ss = start_s ; ss < s ; ss++ ) {
00143 const std::map<int, int> &targets = alignments.m_alignS2T[ss];
00144
00145 std::map<int, int>::const_iterator iter;
00146 for (iter = targets.begin(); iter != targets.end(); ++iter) {
00147 int tt = iter->first;
00148 targetBitmap[tt] = 0;
00149 }
00150
00151
00152 }
00153
00154
00155
00156 if (start_i < i ) {
00157
00158
00159 string insertion = "";
00160 for (int ii = start_i ; ii < i ; ii++ ) {
00161 insertion += inputToks[ii] + " ";
00162 }
00163
00164
00165
00166
00167 int start_t = 1000;
00168 for ( int ss = start_s ; ss < s ; ss++ ) {
00169 const std::map<int, int> &targets = alignments.m_alignS2T[ss];
00170
00171 std::map<int, int>::const_iterator iter;
00172 for (iter = targets.begin(); iter != targets.end(); ++iter) {
00173 int tt = iter->first;
00174 if (tt < start_t) {
00175 start_t = tt;
00176 }
00177 }
00178 }
00179
00180
00181 if ( start_t == 1000 && i > int(inputToks.size()) - 1 ) {
00182 start_t = targetsToks.size() - 1;
00183 }
00184
00185
00186 if ( start_t == 1000 ) {
00187 start_t = -1;
00188 for ( int ss = s - 1 ; start_t == -1 && ss >= 0 ; ss-- ) {
00189 const std::map<int, int> &targets = alignments.m_alignS2T[ss];
00190
00191 std::map<int, int>::const_iterator iter;
00192 for (iter = targets.begin(); iter != targets.end(); ++iter) {
00193 int tt = iter->first;
00194 if (tt > start_t) {
00195 start_t = tt;
00196 }
00197 }
00198 }
00199 }
00200
00201 frameInput[start_t] += insertion;
00202 map<string, int> nt;
00203 nt["start_t"] = start_t;
00204 nt["start_i"] = start_i;
00205 nonTerms.push_back(nt);
00206
00207 }
00208
00209 currently_matching = 1;
00210 }
00211
00212
00213
00214
00215
00216
00217
00218 if ( action != "I" ) {
00219
00220
00221 if (s < int(alignments.m_alignS2T.size())) {
00222 const std::map<int, int> &targets = alignments.m_alignS2T[s];
00223
00224
00225 std::map<int, int>::const_iterator iter;
00226 for (iter = targets.begin(); iter != targets.end(); ++iter) {
00227
00228
00229 }
00230 }
00231 }
00232
00233
00234 if (action != "I")
00235 s++;
00236 if (action != "D") {
00237 i++;
00238 alignI2S[i] = s;
00239 }
00240
00241 if (action == "M") {
00242 inputBitmap.push_back(1);
00243 } else if (action == "I" || action == "S") {
00244 inputBitmap.push_back(0);
00245 }
00246
00247 }
00248
00249
00250 for (size_t i = 0; i < targetBitmap.size(); ++i) {
00251
00252 }
00253
00254
00255 for (map<int, string>::const_iterator iter = frameInput.begin(); iter != frameInput.end(); ++iter) {
00256
00257 }
00258
00259
00260
00261
00262 int rule_pos_s = 0;
00263 map<int, int> ruleAlignS;
00264
00265 for (int i = 0 ; i < int(inputBitmap.size()) ; ++i ) {
00266 if ( inputBitmap[i] ) {
00267 ret.ruleS += inputToks[i] + " ";
00268 ruleAlignS[ alignI2S[i] ] = rule_pos_s++;
00269 }
00270
00271 for (size_t j = 0; j < nonTerms.size(); ++j) {
00272 map<string, int> &nt = nonTerms[j];
00273 if (i == nt["start_i"]) {
00274 ret.ruleS += "[X][X] ";
00275 nt["rule_pos_s"] = rule_pos_s++;
00276 }
00277 }
00278 }
00279
00280 int rule_pos_t = 0;
00281 map<int, int> ruleAlignT;
00282
00283 for (int t = -1 ; t < (int) targetBitmap.size(); t++ ) {
00284 if (t >= 0 && targetBitmap[t]) {
00285 ret.ruleT += targetsToks[t] + " ";
00286 ruleAlignT[t] = rule_pos_t++;
00287 }
00288
00289 for (size_t i = 0; i < nonTerms.size(); ++i) {
00290 map<string, int> &nt = nonTerms[i];
00291
00292 if (t == nt["start_t"]) {
00293 ret.ruleT += "[X][X] ";
00294 nt["rule_pos_t"] = rule_pos_t++;
00295 }
00296 }
00297 }
00298
00299 int numAlign = 0;
00300 ret.ruleAlignment = "";
00301
00302 for (map<int, int>::const_iterator iter = ruleAlignS.begin(); iter != ruleAlignS.end(); ++iter) {
00303 int s = iter->first;
00304
00305 if (s < int(alignments.m_alignS2T.size())) {
00306 const std::map<int, int> &targets = alignments.m_alignS2T[s];
00307
00308 std::map<int, int>::const_iterator iter;
00309 for (iter = targets.begin(); iter != targets.end(); ++iter) {
00310 int t =iter->first;
00311 if (ruleAlignT.find(t) == ruleAlignT.end())
00312 continue;
00313 ret.ruleAlignment += SPrint(ruleAlignS[s]) + "-" + SPrint(ruleAlignT[t]) + " ";
00314 ++numAlign;
00315 }
00316 }
00317 }
00318
00319
00320
00321 for (size_t i = 0; i < nonTerms.size(); ++i) {
00322 map<string, int> &nt = nonTerms[i];
00323 ret.ruleAlignment += SPrint(nt["rule_pos_s"]) + "-" + SPrint(nt["rule_pos_t"]) + " ";
00324 ++numAlign;
00325 }
00326
00327
00328
00329 ret.ruleS = TrimInternal(ret.ruleS);
00330 ret.ruleT = TrimInternal(ret.ruleT);
00331 ret.ruleAlignment = TrimInternal(ret.ruleAlignment);
00332
00333 vector<string> ruleAlignmentToks = Tokenize(ret.ruleAlignment);
00334 for (size_t i = 0; i < ruleAlignmentToks.size(); ++i) {
00335 const string &alignPoint = ruleAlignmentToks[i];
00336 vector<string> toks = Tokenize(alignPoint, "-");
00337 assert(toks.size() == 2);
00338 ret.ruleAlignmentInv += toks[1] + "-" +toks[0];
00339 }
00340 ret.ruleAlignmentInv = TrimInternal(ret.ruleAlignmentInv);
00341
00342
00343
00344 if (frameInput.find(-1) == frameInput.end())
00345 ret.frame = frameInput[-1];
00346
00347 int currently_included = 0;
00348 int start_t = -1;
00349 targetBitmap.push_back(0);
00350
00351 for (size_t t = 0 ; t <= targetsToks.size() ; t++ ) {
00352
00353 if ( !currently_included && targetBitmap[t] ) {
00354 start_t = t;
00355 currently_included = 1;
00356 }
00357
00358 else if (currently_included
00359 && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() )
00360 ) {
00361
00362 if ( start_t >= 0 ) {
00363 string target = "";
00364
00365 for (size_t tt = start_t ; tt < t + targetBitmap[t] ; tt++ ) {
00366 target += targetsToks[tt] + " ";
00367 }
00368
00369 ret.frame += "<xml translation=\"" + target + "\"> x </xml> ";
00370 }
00371 currently_included = 0;
00372 }
00373
00374 if (frameInput.find(t) != frameInput.end())
00375 ret.frame += frameInput[t];
00376
00377
00378
00379 }
00380
00381 cerr << ret.frame << "\n-------------------------------------\n";
00382 return ret;
00383
00384 }
00385
00386
00387