00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "XmlOption.h"
00023 #include <vector>
00024 #include <string>
00025 #include <iostream>
00026 #include <boost/algorithm/string/predicate.hpp>
00027 #include <boost/foreach.hpp>
00028 #include <boost/unordered_map.hpp>
00029 #include "Util.h"
00030 #include "StaticData.h"
00031 #include "Range.h"
00032 #include "TargetPhrase.h"
00033 #include "ReorderingConstraint.h"
00034 #include "FactorCollection.h"
00035 #include "moses/TranslationModel/PhraseDictionary.h"
00036 #if PT_UG
00037 #include "TranslationModel/UG/mmsapt.h"
00038 #endif
00039
00040 namespace Moses
00041 {
00042 using namespace std;
00043 using namespace boost::algorithm;
00044
/**
 * Extract the raw value of one attribute from the text of an XML tag.
 *
 * The first occurrence of `attributeName="` in \p tag marks the start of the
 * value; the value ends at the next double quote. A double quote that is
 * directly preceded by a backslash is treated as escaped and does not end
 * the value. No unescaping is performed: backslashes stay in the result.
 *
 * \param tag            text to search (e.g. the attribute part of a tag)
 * \param attributeName  attribute whose value is wanted
 * \return the text between the quotes; an empty string when the attribute is
 *         absent, when its value is empty, or when no closing quote exists
 *         (the last case is also reported through TRACE_ERR)
 */
std::string ParseXmlTagAttribute(const std::string& tag, const std::string& attributeName)
{
  const std::string tagOpen = attributeName + "=\"";
  std::string::size_type contentsStart = tag.find(tagOpen);
  if (contentsStart == std::string::npos) return "";
  contentsStart += tagOpen.size();

  // Search from the first character of the value itself. Starting one
  // character later would step over the closing quote of an empty value
  // (attr="") and return text belonging to whatever follows it.
  std::string::size_type contentsEnd = tag.find('"', contentsStart);
  if (contentsEnd == std::string::npos) {
    TRACE_ERR("Malformed XML attribute: "<< tag);
    return "";
  }

  // A quote preceded by a backslash is escaped: move on to the next quote.
  // If there is none, the escaped quote is accepted as the end of the value.
  // contentsEnd >= contentsStart >= 2 here, so contentsEnd - 1 is in range.
  while (tag[contentsEnd - 1] == '\\') {
    const std::string::size_type possibleEnd = tag.find('"', contentsEnd + 1);
    if (possibleEnd == std::string::npos) break;
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart, contentsEnd - contentsStart);
}
00063
/**
 * Remove the enclosing tag brackets from a token.
 *
 * \param str        token to trim
 * \param lbrackStr  opening bracket string
 * \param rbrackStr  closing bracket string
 * \return \p str without its leading \p lbrackStr and trailing \p rbrackStr
 *         when it has both; otherwise \p str unchanged
 */
std::string TrimXml(const std::string& str, const std::string& lbrackStr, const std::string& rbrackStr)
{
  const std::string::size_type openLen = lbrackStr.length();
  const std::string::size_type closeLen = rbrackStr.length();
  const std::string::size_type total = str.size();

  // Too short to contain both brackets: nothing to strip.
  if (total < openLen + closeLen) return str;

  const bool hasOpen = str.compare(0, openLen, lbrackStr) == 0;
  const bool hasClose = str.compare(total - closeLen, closeLen, rbrackStr) == 0;
  if (!hasOpen || !hasClose) return str;

  return str.substr(openLen, total - openLen - closeLen);
}
00085
/**
 * Decide whether a token looks like an XML tag.
 *
 * A token counts as a tag when it begins with \p lbrackStr and the character
 * right after the bracket is '/' or an ASCII letter.
 *
 * \param tag        token to inspect
 * \param lbrackStr  opening bracket string
 * \param rbrackStr  closing bracket string (not consulted by this check)
 * \return true if the token starts like an opening or closing tag
 */
bool isXmlTag(const std::string& tag, const std::string& lbrackStr, const std::string& rbrackStr)
{
  const std::string::size_type bracketLen = lbrackStr.length();

  // Must start with the opening bracket.
  if (tag.compare(0, bracketLen, lbrackStr) != 0) return false;

  // First character after the bracket; reads as '\0' when the token ends
  // right after the bracket, which fails every test below.
  const char next = tag[bracketLen];
  if (next == '/') return true;
  if (next >= 'a' && next <= 'z') return true;
  return next >= 'A' && next <= 'Z';
}
00100
/**
 * Split a string into alternating text and tag tokens.
 *
 * Every stretch from an opening bracket through the next closing bracket
 * becomes one token (brackets included); the non-empty text between such
 * stretches becomes a token of its own. On well-formed input the tokens,
 * concatenated in order, reproduce \p str.
 *
 * \param str        string to split
 * \param lbrackStr  opening bracket string
 * \param rbrackStr  closing bracket string
 * \return the tokens in input order. If an opening bracket has no closing
 *         bracket after it, an error is reported through TRACE_ERR and only
 *         the tokens completed before that bracket are returned (the text
 *         directly in front of the unterminated bracket is not included).
 *         If either bracket string is empty no tag can be delimited and the
 *         whole input is returned as a single text token.
 */
std::vector<std::string> TokenizeXml(const std::string& str, const std::string& lbrackStr, const std::string& rbrackStr)
{
  std::vector<std::string> tokens;

  // Empty delimiters cannot mark a tag and could stop the scan below from
  // ever advancing, so treat the input as plain text.
  if (lbrackStr.empty() || rbrackStr.empty()) {
    if (!str.empty()) tokens.push_back(str);
    return tokens;
  }

  std::string::size_type cpos = 0; // start of the part not yet tokenized
  while (cpos != str.size()) {
    const std::string::size_type lpos = str.find(lbrackStr, cpos);
    if (lpos == std::string::npos) {
      // No further tag: the remainder is one text token.
      tokens.push_back(str.substr(cpos));
      break;
    }

    // Look for the closing bracket only after the complete opening bracket,
    // so the two can never overlap (matters when the closing bracket begins
    // with the last character of the opening one, e.g. identical delimiters).
    const std::string::size_type rpos = str.find(rbrackStr, lpos + lbrackStr.length());
    if (rpos == std::string::npos) {
      TRACE_ERR("ERROR: malformed XML: " << str << std::endl);
      return tokens;
    }

    // Text in front of the tag, if any.
    if (lpos > cpos)
      tokens.push_back(str.substr(cpos, lpos - cpos));

    // The tag itself, brackets included.
    tokens.push_back(str.substr(lpos, rpos - lpos + rbrackStr.length()));
    cpos = rpos + rbrackStr.length();
  }
  return tokens;
}
00148
00161 bool
00162 ProcessAndStripXMLTags(AllOptions const& opts, string &line,
00163 vector<XmlOption const*> &res,
00164 ReorderingConstraint &reorderingConstraint,
00165 vector< size_t > &walls,
00166 std::vector< std::pair<size_t, std::string> > &placeholders,
00167 InputType &input)
00168 {
00169
00170
00171 const std::string& lbrackStr = opts.input.xml_brackets.first;
00172 const std::string& rbrackStr = opts.input.xml_brackets.second;
00173 int offset = is_syntax(opts.search.algo) ? 1 : 0;
00174
00175
00176
00177
00178 PhraseDictionary *firstPt = NULL;
00179 if (PhraseDictionary::GetColl().size() == 0) {
00180 firstPt = PhraseDictionary::GetColl()[0];
00181 }
00182
00183
00184 if (line.find(lbrackStr) == string::npos) {
00185 return true;
00186 }
00187
00188
00189
00190 vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr);
00191
00192
00193
00194 typedef pair< string, pair< size_t, string > > OpenedTag;
00195 vector< OpenedTag > tagStack;
00196
00197 string cleanLine;
00198 size_t wordPos = 0;
00199
00200 const vector<FactorType> &outputFactorOrder = opts.output.factor_order;
00201
00202
00203 for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
00204
00205 if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) {
00206
00207 if (cleanLine.size()>0 &&
00208 cleanLine[cleanLine.size() - 1] != ' ' &&
00209 xmlTokens[xmlTokenPos][0] != ' ') {
00210 cleanLine += " ";
00211 }
00212 cleanLine += xmlTokens[xmlTokenPos];
00213 wordPos = Tokenize(cleanLine).size();
00214 }
00215
00216
00217 else {
00218
00219
00220
00221 string tag = Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr));
00222 VERBOSE(3,"XML TAG IS: " << tag << std::endl);
00223
00224 if (tag.size() == 0) {
00225 TRACE_ERR("ERROR: empty tag name: " << line << endl);
00226 return false;
00227 }
00228
00229
00230 bool isUnary = ( tag[tag.size() - 1] == '/' );
00231
00232
00233 bool isClosed = ( tag[0] == '/' );
00234 bool isOpen = !isClosed;
00235
00236 if (isClosed && isUnary) {
00237 TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl);
00238 return false;
00239 }
00240
00241 if (isClosed)
00242 tag = tag.substr(1);
00243 if (isUnary)
00244 tag = tag.substr(0,tag.size()-1);
00245
00246
00247 string::size_type endOfName = tag.find_first_of(' ');
00248 string tagName = tag;
00249 string tagContent = "";
00250 if (endOfName != string::npos) {
00251 tagName = tag.substr(0,endOfName);
00252 tagContent = tag.substr(endOfName+1);
00253 }
00254
00255
00256
00257 if (isOpen || isUnary) {
00258
00259 OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
00260 tagStack.push_back( openedTag );
00261 VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
00262 }
00263
00264
00265
00266 if (isClosed || isUnary) {
00267
00268 if (tagStack.size() == 0) {
00269 TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
00270 return false;
00271 }
00272 OpenedTag openedTag = tagStack.back();
00273 tagStack.pop_back();
00274
00275
00276 if (openedTag.first != tagName) {
00277 TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
00278 return false;
00279 }
00280
00281
00282 size_t startPos = openedTag.second.first;
00283 string tagContent = openedTag.second.second;
00284 size_t endPos = wordPos;
00285
00286
00287 string span = ParseXmlTagAttribute(tagContent,"span");
00288 if (! span.empty()) {
00289 vector<string> ij = Tokenize(span, "-");
00290 if (ij.size() != 1 && ij.size() != 2) {
00291 TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
00292 return false;
00293 }
00294 startPos = atoi(ij[0].c_str());
00295 if (ij.size() == 1) endPos = startPos + 1;
00296 else endPos = atoi(ij[1].c_str()) + 1;
00297 }
00298
00299 VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
00300
00301
00302 if (tagName == "wall") {
00303 size_t start = (startPos == 0) ? 0 : startPos-1;
00304 for(size_t pos = start; pos < endPos; pos++)
00305 walls.push_back( pos );
00306 }
00307
00308
00309 else if (tagName == "zone") {
00310 if (startPos >= endPos) {
00311 TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
00312 return false;
00313 }
00314 reorderingConstraint.SetZone( startPos, endPos-1 );
00315 }
00316
00317
00318 else if (tagName == "ne") {
00319 if (startPos != (endPos - 1)) {
00320 TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl);
00321 return false;
00322 }
00323 string entity = ParseXmlTagAttribute(tagContent,"entity");
00324 placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
00325 }
00326
00327
00328 else if (tagName == "update") {
00329 #if PT_UG
00330
00331 string pdName = ParseXmlTagAttribute(tagContent,"name");
00332 string source = ParseXmlTagAttribute(tagContent,"source");
00333 string target = ParseXmlTagAttribute(tagContent,"target");
00334 string alignment = ParseXmlTagAttribute(tagContent,"alignment");
00335
00336 const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl();
00337 PhraseDictionary* pd = NULL;
00338 for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) {
00339 PhraseDictionary* curPd = *i;
00340 if (curPd->GetScoreProducerDescription() == pdName) {
00341 pd = curPd;
00342 break;
00343 }
00344 }
00345 if (pd == NULL) {
00346 TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl);
00347 return false;
00348 }
00349
00350 VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl);
00351 Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd);
00352 pdsa->add(source, target, alignment);
00353 #else
00354 TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl);
00355 return false;
00356 #endif
00357 }
00358
00359
00360
00361
00362
00363
00364 else if (tagName == "weight-overwrite") {
00365
00366
00367 const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
00368 boost::unordered_map<string, FeatureFunction*> map;
00369 BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
00370 map[ff->GetScoreProducerDescription()] = ff;
00371 }
00372
00373
00374 ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights();
00375 boost::unordered_map<string, FeatureFunction*>::iterator ffi;
00376 string ffName("");
00377 vector<float> ffWeights;
00378 vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights"));
00379 BOOST_FOREACH(string const& tok, toks) {
00380 if (ends_with(tok, "=")) {
00381
00382 if (ffName != "") {
00383
00384 if (ffi != map.end()) {
00385 allWeights.Assign(ffi->second, ffWeights);
00386 }
00387 ffWeights.clear();
00388 }
00389 ffName = tok.substr(0, tok.size() - 1);
00390 ffi = map.find(ffName);
00391 if (ffi == map.end()) {
00392 TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl);
00393 }
00394 } else {
00395
00396 ffWeights.push_back(Scan<float>(tok));
00397 }
00398 }
00399 if (ffi != map.end()) {
00400 allWeights.Assign(ffi->second, ffWeights);
00401 }
00402 StaticData::InstanceNonConst().SetAllWeights(allWeights);
00403 }
00404
00405
00406
00407
00408
00409
00410 else if (tagName == "coord") {
00411
00412 string space = ParseXmlTagAttribute(tagContent, "space");
00413 vector<string> tok = Tokenize(ParseXmlTagAttribute(tagContent, "coord"));
00414 size_t id = StaticData::Instance().GetCoordSpace(space);
00415 if (!id) {
00416 TRACE_ERR("ERROR: no models use space " << space << ", will be ignored" << endl);
00417 } else {
00418
00419 if (!input.m_coordMap) {
00420 input.m_coordMap.reset(new map<size_t const, vector<float> >);
00421 }
00422 vector<float>& coord = (*input.m_coordMap)[id];
00423 Scan<float>(coord, tok);
00424 }
00425 }
00426
00427
00428 else {
00429 if (startPos > endPos) {
00430 TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl);
00431 return false;
00432 } else if (startPos == endPos) {
00433 TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl);
00434 continue;
00435 }
00436
00437
00438
00439 vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
00440 if( altTexts.size() == 1 && altTexts[0] == "" )
00441 altTexts.pop_back();
00442
00443 vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
00444 if (moreAltTexts.size()>1 || moreAltTexts[0] != "") {
00445 for(vector<string>::iterator translation=moreAltTexts.begin();
00446 translation != moreAltTexts.end();
00447 translation++) {
00448 string t = *translation;
00449 altTexts.push_back( t );
00450 }
00451 }
00452
00453
00454 vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
00455 if( altProbs.size() == 1 && altProbs[0] == "" )
00456 altProbs.pop_back();
00457
00458
00459 VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
00460 VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
00461 VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
00462 VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);
00463 if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
00464 TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
00465 return false;
00466 }
00467
00468
00469 if (opts.input.xml_policy != XmlIgnore) {
00470
00471 for (size_t i=0; i<altTexts.size(); ++i) {
00472 Phrase sourcePhrase;
00473
00474
00475 float probValue = 1;
00476 if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
00477
00478 float scoreValue = FloorScore(TransformScore(probValue));
00479
00480 Range range(startPos + offset,endPos-1 + offset);
00481 TargetPhrase targetPhrase(firstPt);
00482
00483
00484
00485
00486
00487
00488 vector<FactorType> fakeOutputFactorOrder;
00489
00490 size_t factorsInAltText = TokenizeMultiCharSeparator(Tokenize(altTexts[i])[0], StaticData::Instance().GetFactorDelimiter()).size();
00491 for (size_t f = 0; f < factorsInAltText; ++f) {
00492 fakeOutputFactorOrder.push_back(f);
00493 }
00494 targetPhrase.CreateFromString(Output, fakeOutputFactorOrder, altTexts[i], NULL);
00495
00496
00497 const UnknownLHSList &lhsList = opts.syntax.unknown_lhs;
00498 if (!lhsList.empty()) {
00499 const Factor *factor = FactorCollection::Instance().AddFactor(lhsList[0].first, true);
00500 Word *targetLHS = new Word(true);
00501 targetLHS->SetFactor(0, factor);
00502 targetPhrase.SetTargetLHS(targetLHS);
00503 }
00504
00505 targetPhrase.SetXMLScore(scoreValue);
00506 targetPhrase.EvaluateInIsolation(sourcePhrase);
00507
00508 XmlOption *option = new XmlOption(range,targetPhrase);
00509 assert(option);
00510
00511 res.push_back(option);
00512 }
00513 altTexts.clear();
00514 altProbs.clear();
00515 }
00516 }
00517 }
00518 }
00519 }
00520
00521 if (tagStack.size() > 0) {
00522 TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
00523 return false;
00524 }
00525
00526
00527 line = cleanLine;
00528 return true;
00529 }
00530
00531 }