Moses: /disk4/html/www/moses/doxygen/mosesdecoder/moses/XmlOption.cpp Source File

00001 // -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
00002 // vim:tabstop=2
00003 /***********************************************************************
00004   Moses - factored phrase-based language decoder
00005   Copyright (C) 2006 University of Edinburgh
00006 
00007   This library is free software; you can redistribute it and/or
00008   modify it under the terms of the GNU Lesser General Public
00009   License as published by the Free Software Foundation; either
00010   version 2.1 of the License, or (at your option) any later version.
00011 
00012   This library is distributed in the hope that it will be useful,
00013   but WITHOUT ANY WARRANTY; without even the implied warranty of
00014   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015   Lesser General Public License for more details.
00016 
00017   You should have received a copy of the GNU Lesser General Public
00018   License along with this library; if not, write to the Free Software
00019   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00020  ***********************************************************************/
00021 
00022 #include "XmlOption.h"
00023 #include <vector>
00024 #include <string>
00025 #include <iostream>
00026 #include <boost/algorithm/string/predicate.hpp>
00027 #include <boost/foreach.hpp>
00028 #include <boost/unordered_map.hpp>
00029 #include "Util.h"
00030 #include "StaticData.h"
00031 #include "Range.h"
00032 #include "TargetPhrase.h"
00033 #include "ReorderingConstraint.h"
00034 #include "FactorCollection.h"
00035 #include "moses/TranslationModel/PhraseDictionary.h"
00036 #if PT_UG
00037 #include "TranslationModel/UG/mmsapt.h"
00038 #endif
00039 
00040 namespace Moses
00041 {
00042 using namespace std;
00043 using namespace boost::algorithm;
00044 
/**
 * Extract the value of an attribute from the text of an XML tag.
 *
 * The attribute is located by searching for attributeName followed by
 * ="  and its value runs up to the next double quote that is not preceded
 * by a backslash. Escaped quotes are kept in the returned value verbatim
 * (backslash included); no unescaping is performed.
 *
 * \param tag text of the tag (typically everything after the tag name)
 * \param attributeName name of the attribute to look up
 * \return the raw attribute value; the empty string if the attribute is
 *         absent, has an empty value, or its closing quote is missing
 *         (the last case is also reported through TRACE_ERR)
 */
string ParseXmlTagAttribute(const string& tag,const string& attributeName)
{
  /*TODO deal with unescaping \"*/
  const string tagOpen = attributeName + "=\"";
  size_t contentsStart = tag.find(tagOpen);
  if (contentsStart == string::npos) return "";
  contentsStart += tagOpen.size();

  // Start looking for the closing quote at the first character of the value,
  // not one past it: otherwise an empty value (attr="") would skip its own
  // closing quote and run on into the following attribute.
  size_t contentsEnd = tag.find('"', contentsStart);
  if (contentsEnd == string::npos) {
    TRACE_ERR("Malformed XML attribute: "<< tag);
    return "";
  }

  // A quote directly preceded by a backslash is escaped: move on to the next
  // quote, if there is one. The quote at contentsStart itself cannot be
  // escaped, since the character before it is the opening quote.
  size_t possibleEnd;
  while (contentsEnd > contentsStart &&
         tag[contentsEnd-1] == '\\' &&
         (possibleEnd = tag.find('"', contentsEnd+1)) != string::npos) {
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart, contentsEnd-contentsStart);
}
00063 
/**
 * Remove the enclosing bracket strings from an XML token.
 *
 * \param str token to trim, e.g. "<b>"
 * \param lbrackStr opening bracket string, e.g. "<"
 * \param rbrackStr closing bracket string, e.g. ">"
 * \return the text between the brackets if str both starts with lbrackStr
 *         and ends with rbrackStr; otherwise str unchanged
 */
string TrimXml(const string& str, const std::string& lbrackStr, const std::string& rbrackStr)
{
  const size_t openLen = lbrackStr.length();
  const size_t closeLen = rbrackStr.length();

  // shorter than both brackets together: cannot be a bracketed token
  if (str.size() < openLen + closeLen) return str;

  // the length check above guarantees both comparisons stay inside str
  const bool opensWithBracket = str.compare(0, openLen, lbrackStr) == 0;
  const bool closesWithBracket =
    str.compare(str.size() - closeLen, closeLen, rbrackStr) == 0;

  // not bracketed on both sides -> hand the token back untouched
  if (!opensWithBracket || !closesWithBracket) return str;

  // drop the leading and trailing bracket strings
  return str.substr(openLen, str.size() - openLen - closeLen);
}
00085 
/**
 * Decide whether a token looks like an XML tag.
 *
 * A token counts as a tag when it starts with lbrackStr and the character
 * right after the bracket is '/' or an ASCII letter. The closing bracket is
 * not inspected.
 *
 * \param tag token to test
 * \param lbrackStr opening bracket string, e.g. "<"
 * \param rbrackStr closing bracket string (unused)
 * \return true if the token has the shape of an opening or closing tag
 */
bool isXmlTag(const string& tag, const std::string& lbrackStr, const std::string& rbrackStr)
{
  const size_t nameStart = lbrackStr.length();

  // must begin with the opening bracket
  if (tag.compare(0, nameStart, lbrackStr) != 0) return false;

  // first character after the bracket; reading at tag.size() yields '\0'
  // on a const string, which fails every test below
  const char first = tag[nameStart];
  if (first == '/') return true;

  const bool isLowerCase = (first >= 'a' && first <= 'z');
  const bool isUpperCase = (first >= 'A' && first <= 'Z');
  return isLowerCase || isUpperCase;
}
00100 
/**
 * Split a string into a sequence of XML tags and the text between them.
 *
 * Example with brackets "<" and ">":
 *   "this <b>is a</b> test ."  ->  (this ), (<b>), (is a), (</b>), ( test .)
 *
 * Text tokens keep their surrounding whitespace; empty text tokens are never
 * produced. Tag tokens include their brackets.
 *
 * \param str input string
 * \param lbrackStr opening bracket string, e.g. "<"
 * \param rbrackStr closing bracket string, e.g. ">"
 * \return the tokens in order of appearance. If an opening bracket has no
 *         matching closing bracket, an error is reported through TRACE_ERR
 *         and only the tokens collected before that tag are returned.
 */
vector<string> TokenizeXml(const string& str, const std::string& lbrackStr, const std::string& rbrackStr)
{
  vector<string> tokens; // vector of tokens to be returned

  // empty bracket strings cannot delimit a tag (and would never let the scan
  // below advance), so the whole input is plain text
  if (lbrackStr.empty() || rbrackStr.empty()) {
    if (!str.empty()) tokens.push_back(str);
    return tokens;
  }

  string::size_type cpos = 0; // current position in string

  // walk through the string (loop over cpos)
  while (cpos != str.size()) {
    // find the next opening bracket of an xml tag
    const string::size_type lpos = str.find(lbrackStr, cpos);
    if (lpos == string::npos) {
      // no more tags found: add the rest as token
      tokens.push_back(str.substr(cpos));
      break;
    }

    // find the end of the xml tag; the search starts after the opening
    // bracket so that identical or overlapping bracket strings cannot
    // match the opening bracket itself
    const string::size_type rpos = str.find(rbrackStr, lpos + lbrackStr.length());
    // sanity check: there has to be a closing bracket
    if (rpos == string::npos) {
      TRACE_ERR("ERROR: malformed XML: " << str << endl);
      return tokens;
    }

    // add stuff before xml tag as token, if there is any
    if (lpos != cpos)
      tokens.push_back(str.substr(cpos, lpos - cpos));

    // add xml tag as token, brackets included
    tokens.push_back(str.substr(lpos, rpos - lpos + rbrackStr.length()));
    cpos = rpos + rbrackStr.length();
  }
  return tokens;
}
00148 
00161 bool
00162 ProcessAndStripXMLTags(AllOptions const& opts, string &line,
00163                        vector<XmlOption const*> &res,
00164                        ReorderingConstraint &reorderingConstraint,
00165                        vector< size_t > &walls,
00166                        std::vector< std::pair<size_t, std::string> > &placeholders,
00167                        InputType &input)
00168 {
00169   //parse XML markup in translation line
00170 
00171   const std::string& lbrackStr = opts.input.xml_brackets.first;
00172   const std::string& rbrackStr = opts.input.xml_brackets.second;
00173   int offset = is_syntax(opts.search.algo) ? 1 : 0;
00174 
00175   // const StaticData &staticData = StaticData::Instance();
00176 
00177   // hack. What pt should XML trans opt be assigned to?
00178   PhraseDictionary *firstPt = NULL;
00179   if (PhraseDictionary::GetColl().size() == 0) {
00180     firstPt = PhraseDictionary::GetColl()[0];
00181   }
00182 
00183   // no xml tag? we're done.
00184   if (line.find(lbrackStr) == string::npos) {
00185     return true;
00186   }
00187 
00188   // break up input into a vector of xml tags and text
00189   // example: (this), (<b>), (is a), (</b>), (test .)
00190   vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr);
00191 
00192   // we need to store opened tags, until they are closed
00193   // tags are stored as tripled (tagname, startpos, contents)
00194   typedef pair< string, pair< size_t, string > > OpenedTag;
00195   vector< OpenedTag > tagStack; // stack that contains active opened tags
00196 
00197   string cleanLine; // return string (text without xml)
00198   size_t wordPos = 0; // position in sentence (in terms of number of words)
00199 
00200   const vector<FactorType> &outputFactorOrder = opts.output.factor_order;
00201 
00202   // loop through the tokens
00203   for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
00204     // not a xml tag, but regular text (may contain many words)
00205     if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) {
00206       // add a space at boundary, if necessary
00207       if (cleanLine.size()>0 &&
00208           cleanLine[cleanLine.size() - 1] != ' ' &&
00209           xmlTokens[xmlTokenPos][0] != ' ') {
00210         cleanLine += " ";
00211       }
00212       cleanLine += xmlTokens[xmlTokenPos]; // add to output
00213       wordPos = Tokenize(cleanLine).size(); // count all the words
00214     }
00215 
00216     // process xml tag
00217     else {
00218       // *** get essential information about tag ***
00219 
00220       // strip extra boundary spaces and "<" and ">"
00221       string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr));
00222       VERBOSE(3,"XML TAG IS: " << tag << std::endl);
00223 
00224       if (tag.size() == 0) {
00225         TRACE_ERR("ERROR: empty tag name: " << line << endl);
00226         return false;
00227       }
00228 
00229       // check if unary (e.g., "<wall/>")
00230       bool isUnary = ( tag[tag.size() - 1] == '/' );
00231 
00232       // check if opening tag (e.g. "<a>", not "</a>")g
00233       bool isClosed = ( tag[0] == '/' );
00234       bool isOpen = !isClosed;
00235 
00236       if (isClosed && isUnary) {
00237         TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl);
00238         return false;
00239       }
00240 
00241       if (isClosed)
00242         tag = tag.substr(1); // remove "/" at the beginning
00243       if (isUnary)
00244         tag = tag.substr(0,tag.size()-1); // remove "/" at the end
00245 
00246       // find the tag name and contents
00247       string::size_type endOfName = tag.find_first_of(' ');
00248       string tagName = tag;
00249       string tagContent = "";
00250       if (endOfName != string::npos) {
00251         tagName = tag.substr(0,endOfName);
00252         tagContent = tag.substr(endOfName+1);
00253       }
00254 
00255       // *** process new tag ***
00256 
00257       if (isOpen || isUnary) {
00258         // put the tag on the tag stack
00259         OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
00260         tagStack.push_back( openedTag );
00261         VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
00262       }
00263 
00264       // *** process completed tag ***
00265 
00266       if (isClosed || isUnary) {
00267         // pop last opened tag from stack;
00268         if (tagStack.size() == 0) {
00269           TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
00270           return false;
00271         }
00272         OpenedTag openedTag = tagStack.back();
00273         tagStack.pop_back();
00274 
00275         // tag names have to match
00276         if (openedTag.first != tagName) {
00277           TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
00278           return false;
00279         }
00280 
00281         // assemble remaining information about tag
00282         size_t startPos = openedTag.second.first;
00283         string tagContent = openedTag.second.second;
00284         size_t endPos = wordPos;
00285 
00286         // span attribute overwrites position
00287         string span = ParseXmlTagAttribute(tagContent,"span");
00288         if (! span.empty()) {
00289           vector<string> ij = Tokenize(span, "-");
00290           if (ij.size() != 1 && ij.size() != 2) {
00291             TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
00292             return false;
00293           }
00294           startPos = atoi(ij[0].c_str());
00295           if (ij.size() == 1) endPos = startPos + 1;
00296           else endPos = atoi(ij[1].c_str()) + 1;
00297         }
00298 
00299         VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
00300 
00301         // special tag: wall
00302         if (tagName == "wall") {
00303           size_t start = (startPos == 0) ? 0 : startPos-1;
00304           for(size_t pos = start; pos < endPos; pos++)
00305             walls.push_back( pos );
00306         }
00307 
00308         // special tag: zone
00309         else if (tagName == "zone") {
00310           if (startPos >= endPos) {
00311             TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
00312             return false;
00313           }
00314           reorderingConstraint.SetZone( startPos, endPos-1 );
00315         }
00316 
00317         // name-entity placeholder
00318         else if (tagName == "ne") {
00319           if (startPos != (endPos - 1)) {
00320             TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl);
00321             return false;
00322           }
00323           string entity = ParseXmlTagAttribute(tagContent,"entity");
00324           placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
00325         }
00326 
00327         // update: add new aligned sentence pair to Mmsapt identified by name
00328         else if (tagName == "update") {
00329 #if PT_UG
00330           // get model name and aligned sentence pair
00331           string pdName = ParseXmlTagAttribute(tagContent,"name");
00332           string source = ParseXmlTagAttribute(tagContent,"source");
00333           string target = ParseXmlTagAttribute(tagContent,"target");
00334           string alignment = ParseXmlTagAttribute(tagContent,"alignment");
00335           // find PhraseDictionary by name
00336           const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl();
00337           PhraseDictionary* pd = NULL;
00338           for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) {
00339             PhraseDictionary* curPd = *i;
00340             if (curPd->GetScoreProducerDescription() == pdName) {
00341               pd = curPd;
00342               break;
00343             }
00344           }
00345           if (pd == NULL) {
00346             TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl);
00347             return false;
00348           }
00349           // update model
00350           VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl);
00351           Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd);
00352           pdsa->add(source, target, alignment);
00353 #else
00354           TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl);
00355           return false;
00356 #endif
00357         }
00358 
00359         // weight-overwrite: update feature weights, unspecified weights remain unchanged
00360         // IMPORTANT: translation models that cache phrases or apply table-limit during load
00361         // based on initial weights need to be reset.  Sending an empty update will do this
00362         // for PhraseDictionaryBitextSampling (Mmsapt) models:
00363         // <update name="TranslationModelName" source=" " target=" " alignment=" " />
00364         else if (tagName == "weight-overwrite") {
00365 
00366           // is a name->ff map stored anywhere so we don't have to build it every time?
00367           const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
00368           boost::unordered_map<string, FeatureFunction*> map;
00369           BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
00370             map[ff->GetScoreProducerDescription()] = ff;
00371           }
00372 
00373           // update each weight listed
00374           ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights();
00375           boost::unordered_map<string, FeatureFunction*>::iterator ffi;
00376           string ffName("");
00377           vector<float> ffWeights;
00378           vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights"));
00379           BOOST_FOREACH(string const& tok, toks) {
00380             if (ends_with(tok, "=")) {
00381               // start new feature
00382               if (ffName != "") {
00383                 // set previous feature weights
00384                 if (ffi != map.end()) {
00385                   allWeights.Assign(ffi->second, ffWeights);
00386                 }
00387                 ffWeights.clear();
00388               }
00389               ffName = tok.substr(0, tok.size() - 1);
00390               ffi = map.find(ffName);
00391               if (ffi == map.end()) {
00392                 TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl);
00393               }
00394             } else {
00395               // weight for current feature
00396               ffWeights.push_back(Scan<float>(tok));
00397             }
00398           }
00399           if (ffi != map.end()) {
00400             allWeights.Assign(ffi->second, ffWeights);
00401           }
00402           StaticData::InstanceNonConst().SetAllWeights(allWeights);
00403         }
00404 
00405         // Coord: coordinates of the input sentence in a user-defined space
00406         // <coord space="NAME" coord="X Y Z ..." />
00407         // where NAME is the name of the space and X Y Z ... are floats.  See
00408         // PhraseDistanceFeature for an example of using this information for
00409         // feature scoring.
00410         else if (tagName == "coord") {
00411           // Parse tag
00412           string space = ParseXmlTagAttribute(tagContent, "space");
00413           vector<string> tok = Tokenize(ParseXmlTagAttribute(tagContent, "coord"));
00414           size_t id = StaticData::Instance().GetCoordSpace(space);
00415           if (!id) {
00416             TRACE_ERR("ERROR: no models use space " << space << ", will be ignored" << endl);
00417           } else {
00418             // Init if needed
00419             if (!input.m_coordMap) {
00420               input.m_coordMap.reset(new map<size_t const, vector<float> >);
00421             }
00422             vector<float>& coord = (*input.m_coordMap)[id];
00423             Scan<float>(coord, tok);
00424           }
00425         }
00426 
00427         // default: opening tag that specifies translation options
00428         else {
00429           if (startPos > endPos) {
00430             TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl);
00431             return false;
00432           } else if (startPos == endPos) {
00433             TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl);
00434             continue;
00435           }
00436 
00437           // specified translations -> vector of phrases
00438           // multiple translations may be specified, separated by "||"
00439           vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
00440           if( altTexts.size() == 1 && altTexts[0] == "" )
00441             altTexts.pop_back(); // happens when nothing specified
00442           // deal with legacy annotations: "translation" was called "english"
00443           vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
00444           if (moreAltTexts.size()>1 || moreAltTexts[0] != "") {
00445             for(vector<string>::iterator translation=moreAltTexts.begin();
00446                 translation != moreAltTexts.end();
00447                 translation++) {
00448               string t = *translation;
00449               altTexts.push_back( t );
00450             }
00451           }
00452 
00453           // specified probabilities for the translations -> vector of probs
00454           vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
00455           if( altProbs.size() == 1 && altProbs[0] == "" )
00456             altProbs.pop_back(); // happens when nothing specified
00457 
00458           // report what we have processed so far
00459           VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
00460           VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
00461           VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
00462           VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);
00463           if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
00464             TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
00465             return false;
00466           }
00467 
00468           // store translation options into members
00469           if (opts.input.xml_policy != XmlIgnore) {
00470             // only store options if we aren't ignoring them
00471             for (size_t i=0; i<altTexts.size(); ++i) {
00472               Phrase sourcePhrase; // TODO don't know what the source phrase is
00473 
00474               // set default probability
00475               float probValue = 1;
00476               if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
00477               // convert from prob to log-prob
00478               float scoreValue = FloorScore(TransformScore(probValue));
00479 
00480               Range range(startPos + offset,endPos-1 + offset); // span covered by phrase
00481               TargetPhrase targetPhrase(firstPt);
00482               // Target factors may be used by intermediate models (example: a
00483               // generation model produces a factor used by a class-based LM
00484               // but NOT output.  Fake the output factor order to match the
00485               // number of factors specified in the alt text.  A one-factor
00486               // system would have "word", a two-factor system would have
00487               // "word|class", and so on.
00488               vector<FactorType> fakeOutputFactorOrder;
00489               // Factors in first word of alt text
00490               size_t factorsInAltText = TokenizeMultiCharSeparator(Tokenize(altTexts[i])[0], StaticData::Instance().GetFactorDelimiter()).size();
00491               for (size_t f = 0; f < factorsInAltText; ++f) {
00492                 fakeOutputFactorOrder.push_back(f);
00493               }
00494               targetPhrase.CreateFromString(Output, fakeOutputFactorOrder, altTexts[i], NULL);
00495 
00496               // lhs
00497               const UnknownLHSList &lhsList = opts.syntax.unknown_lhs; // staticData.GetUnknownLHS();
00498               if (!lhsList.empty()) {
00499                 const Factor *factor = FactorCollection::Instance().AddFactor(lhsList[0].first, true);
00500                 Word *targetLHS = new Word(true);
00501                 targetLHS->SetFactor(0, factor); // TODO - other factors too?
00502                 targetPhrase.SetTargetLHS(targetLHS);
00503               }
00504 
00505               targetPhrase.SetXMLScore(scoreValue);
00506               targetPhrase.EvaluateInIsolation(sourcePhrase);
00507 
00508               XmlOption *option = new XmlOption(range,targetPhrase);
00509               assert(option);
00510 
00511               res.push_back(option);
00512             }
00513             altTexts.clear();
00514             altProbs.clear();
00515           }
00516         }
00517       }
00518     }
00519   }
00520   // we are done. check if there are tags that are still open
00521   if (tagStack.size() > 0) {
00522     TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
00523     return false;
00524   }
00525 
00526   // return de-xml'ed sentence in line
00527   line = cleanLine;
00528   return true;
00529 }
00530 
00531 }