00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <cassert>
00021 #include <vector>
00022 #include <string>
00023 #include <set>
00024 #include <iostream>
00025 #include <cstdlib>
00026 #include <sstream>
00027
00028 #include "SyntaxNodeCollection.h"
00029 #include "XmlException.h"
00030
00031 using namespace std;
00032
00033 namespace MosesTraining
00034 {
00035
/**
 * Split a string into the maximal runs of characters that are not delimiters.
 *
 * Consecutive delimiters are treated as a single separator, and leading or
 * trailing delimiters do not produce empty tokens.
 *
 * @param str        text to split
 * @param delimiters set of separator characters (space and tab by default)
 * @return the tokens in order of appearance; empty if str holds no token
 */
inline std::vector<std::string> Tokenize(const std::string& str,
    const std::string& delimiters = " \t")
{
  std::vector<std::string> result;

  // wordBegin always sits on the first character of the next token (or npos).
  for (std::string::size_type wordBegin = str.find_first_not_of(delimiters);
       wordBegin != std::string::npos; ) {
    // One past the last character of the token; npos means "until the end".
    const std::string::size_type wordEnd = str.find_first_of(delimiters, wordBegin);
    result.push_back(str.substr(wordBegin, wordEnd - wordBegin));

    // Skip the separator run; searching from npos simply yields npos again.
    wordBegin = str.find_first_not_of(delimiters, wordEnd);
  }

  return result;
}
00056
/**
 * Remove leading and trailing characters that belong to dropChars.
 *
 * @param str       text to trim
 * @param dropChars characters to strip from both ends (whitespace by default)
 * @return the trimmed copy; empty if str consists only of dropChars
 */
std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type firstKept = str.find_first_not_of(dropChars);
  if (firstKept == std::string::npos) {
    // Nothing survives the trimming (also covers the empty string).
    return std::string();
  }
  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
  return str.substr(firstKept, lastKept - firstKept + 1);
}
00063
/**
 * Extract the value of a single attribute from the text of an XML tag.
 *
 * The attribute is located by searching for attributeName followed by
 * an equals sign and a double quote; the value runs up to the next double
 * quote that is not preceded by a backslash. Backslashes are NOT removed
 * from the returned value.
 *
 * @param tag           tag text to search
 * @param attributeName name of the attribute to look up
 * @return the raw attribute value; an empty string if the attribute is
 *         absent, has an empty value, or its closing quote is missing (in
 *         the last case a diagnostic is written to stderr)
 */
string ParseXmlTagAttribute(const string& tag,const string& attributeName)
{
  // NOTE(review): plain substring match, so looking up "label" also matches
  // the tail of a longer attribute name such as "sublabel".
  const string tagOpen = attributeName + "=\"";
  size_t contentsStart = tag.find(tagOpen);
  if (contentsStart == string::npos) return "";
  contentsStart += tagOpen.size();

  // Search from contentsStart itself (not contentsStart+1): for an empty
  // value the closing quote is the very first character after the opening
  // one, and skipping it would run into the following attribute.
  size_t contentsEnd = tag.find('"', contentsStart);
  if (contentsEnd == string::npos) {
    cerr << "Malformed XML attribute: " << tag << endl;
    return "";
  }

  // A quote preceded by a backslash is escaped: keep extending the value to
  // the next quote while one exists.
  size_t possibleEnd;
  while (tag.at(contentsEnd-1) == '\\' &&
         (possibleEnd = tag.find('"', contentsEnd+1)) != string::npos) {
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart, contentsEnd-contentsStart);
}
00082
00083
00084
00085 void ParseXmlTagAttributes(const std::string &s,
00086 std::map<std::string, std::string> &attributes)
00087 {
00088 std::size_t begin = 0;
00089 while (true) {
00090 std::size_t pos = s.find('=', begin);
00091 if (pos == std::string::npos) {
00092 return;
00093 }
00094 std::string name = Trim(s.substr(begin, pos-begin));
00095 begin = s.find('"', pos+1);
00096 if (begin == std::string::npos) {
00097 throw XmlException("invalid tag content");
00098 }
00099 pos = s.find('"', begin+1);
00100 if (pos == std::string::npos) {
00101 throw XmlException("invalid tag content");
00102 }
00103 while (s[pos-1] == '\\') {
00104 pos = s.find('"', pos+1);
00105 if (pos == std::string::npos) {
00106 throw XmlException("invalid tag content");
00107 }
00108 }
00109 if (name != "label" && name != "span") {
00110 attributes[name] = s.substr(begin+1, pos-begin-1);
00111 }
00112 begin = pos+1;
00113 }
00114 }
00115
/**
 * Strip the enclosing angle brackets from an XML tag.
 *
 * @param str candidate tag text
 * @return str without its first and last character when it starts with '<'
 *         and ends with '>'; otherwise str unchanged
 */
string TrimXml(const string& str)
{
  const string::size_type len = str.size();
  const bool bracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';
  return bracketed ? str.substr(1, len - 2) : str;
}
00135
/**
 * Tell whether a token produced by TokenizeXml is markup rather than text.
 *
 * @param tag token to inspect
 * @return true if the token's first character is '<'; false otherwise,
 *         including for an empty token
 */
bool isXmlTag(const string& tag)
{
  return !tag.empty() && tag[0] == '<';
}
00145
00149 string unescape(const string& str)
00150 {
00151 string s;
00152 s.reserve(str.size());
00153 string::size_type n;
00154 string::size_type start = 0;
00155 while ((n = str.find('&', start)) != string::npos) {
00156 s += str.substr(start, n-start);
00157 string::size_type end = str.find(';', n);
00158 assert(n != string::npos);
00159 string name = str.substr(n+1, end-n-1);
00160 if (name == "lt") {
00161 s += string("<");
00162 } else if (name == "gt") {
00163 s += string(">");
00164 } else if (name == "#91") {
00165 s += string("[");
00166 } else if (name == "#93") {
00167 s += string("]");
00168 } else if (name == "bra") {
00169 s += string("[");
00170 } else if (name == "ket") {
00171 s += string("]");
00172 } else if (name == "bar" || name == "#124") {
00173 s += string("|");
00174 } else if (name == "amp") {
00175 s += string("&");
00176 } else if (name == "apos") {
00177 s += string("'");
00178 } else if (name == "quot") {
00179 s += string("\"");
00180 } else {
00181
00182
00183
00184
00185
00186
00187
00188 std::ostringstream msg;
00189 msg << "unsupported XML escape sequence: &" << name << ";";
00190 throw XmlException(msg.str());
00191 }
00192 if (end == str.size()-1) {
00193 return s;
00194 }
00195 start = end + 1;
00196 }
00197 s += str.substr(start);
00198 return s;
00199 }
00200
/**
 * Split a line into alternating pieces of plain text and XML tags.
 *
 * Each tag (from '<' up to and including the next '>') becomes one token;
 * each non-empty stretch of text between tags becomes one token, with its
 * whitespace preserved.
 *
 * @param str line to split
 * @return the tokens in order of appearance. If a '<' has no matching '>',
 *         an error is written to stderr and only the tokens completed
 *         before that point are returned (the text directly in front of
 *         the unterminated tag is not included).
 */
vector<string> TokenizeXml(const string& str)
{
  vector<string> pieces;
  string::size_type cursor = 0; // first character not yet consumed

  while (cursor != str.size()) {
    const string::size_type tagOpen = str.find('<', cursor);
    if (tagOpen == string::npos) {
      // no further tag: the remainder is one last piece of text
      pieces.push_back(str.substr(cursor));
      break;
    }

    const string::size_type tagClose = str.find('>', tagOpen);
    if (tagClose == string::npos) {
      cerr << "ERROR: malformed XML: " << str << endl;
      return pieces;
    }

    // text between the previous tag (or line start) and this tag
    if (tagOpen != cursor) {
      pieces.push_back(str.substr(cursor, tagOpen - cursor));
    }

    // the tag itself, brackets included
    pieces.push_back(str.substr(tagOpen, tagClose - tagOpen + 1));
    cursor = tagClose + 1;
  }
  return pieces;
}
00246
00259 bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
00260 set< string > &labelCollection,
00261 map< string, int > &topLabelCollection,
00262 bool unescapeSpecialChars )
00263 {
00264
00265
00266
00267 if (line.find_first_of('<') == string::npos) {
00268 return true;
00269 }
00270
00271
00272
00273 vector<string> xmlTokens = TokenizeXml(line);
00274
00275
00276
00277 typedef pair< string, pair< size_t, string > > OpenedTag;
00278 vector< OpenedTag > tagStack;
00279
00280 string cleanLine;
00281 size_t wordPos = 0;
00282
00283
00284 for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
00285
00286 if(!isXmlTag(xmlTokens[xmlTokenPos])) {
00287
00288 if (cleanLine.size()>0 &&
00289 cleanLine[cleanLine.size() - 1] != ' ' &&
00290 xmlTokens[xmlTokenPos][0] != ' ') {
00291 cleanLine += " ";
00292 }
00293
00294 if (unescapeSpecialChars) {
00295 cleanLine += unescape(xmlTokens[xmlTokenPos]);
00296 } else {
00297 cleanLine += xmlTokens[xmlTokenPos];
00298 }
00299 wordPos = Tokenize(cleanLine).size();
00300 }
00301
00302
00303 else {
00304
00305
00306
00307 string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
00308
00309
00310 if (tag.size() == 0) {
00311 cerr << "ERROR: empty tag name: " << line << endl;
00312 return false;
00313 }
00314
00315
00316 bool isUnary = ( tag[tag.size() - 1] == '/' );
00317
00318
00319 bool isClosed = ( tag[0] == '/' );
00320 bool isOpen = !isClosed;
00321
00322 if (isClosed && isUnary) {
00323 cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
00324 return false;
00325 }
00326
00327 if (isClosed)
00328 tag = tag.substr(1);
00329 if (isUnary)
00330 tag = tag.substr(0,tag.size()-1);
00331
00332
00333 string::size_type endOfName = tag.find_first_of(' ');
00334 string tagName = tag;
00335 string tagContent = "";
00336 if (endOfName != string::npos) {
00337 tagName = tag.substr(0,endOfName);
00338 tagContent = tag.substr(endOfName+1);
00339 }
00340
00341
00342
00343 if (isOpen || isUnary) {
00344
00345 OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
00346 tagStack.push_back( openedTag );
00347
00348 }
00349
00350
00351
00352 if (isClosed || isUnary) {
00353
00354 if (tagStack.size() == 0) {
00355 cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
00356 return false;
00357 }
00358 OpenedTag openedTag = tagStack.back();
00359 tagStack.pop_back();
00360
00361
00362 if (openedTag.first != tagName) {
00363 cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
00364 return false;
00365 }
00366
00367
00368 size_t startPos = openedTag.second.first;
00369 string tagContent = openedTag.second.second;
00370 size_t endPos = wordPos;
00371
00372
00373 string span = ParseXmlTagAttribute(tagContent,"span");
00374 if (! span.empty()) {
00375 vector<string> ij = Tokenize(span, "-");
00376 if (ij.size() != 1 && ij.size() != 2) {
00377 cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
00378 return false;
00379 }
00380 startPos = atoi(ij[0].c_str());
00381 if (ij.size() == 1) endPos = startPos + 1;
00382 else endPos = atoi(ij[1].c_str()) + 1;
00383 }
00384
00385
00386
00387 if (startPos > endPos) {
00388 cerr << "ERROR: tag " << tagName << " startPos is bigger than endPos (" << startPos << "-" << endPos << "): " << line << endl;
00389 return false;
00390 } else if (startPos == endPos) {
00391 cerr << "WARNING: tag " << tagName << ". Ignoring 0 span (" << startPos << "-" << endPos << "): " << line << endl;
00392 continue;
00393 }
00394
00395 string label = ParseXmlTagAttribute(tagContent,"label");
00396 labelCollection.insert( label );
00397
00398
00399 if (0) {
00400 cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
00401 cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
00402 cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
00403 }
00404 SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
00405 ParseXmlTagAttributes(tagContent, node->attributes);
00406 }
00407 }
00408 }
00409
00410 if (tagStack.size() > 0) {
00411 cerr << "ERROR: some opened tags were never closed: " << line << endl;
00412 return false;
00413 }
00414
00415
00416 const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
00417 for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
00418 SyntaxNode *n = *node;
00419 const string &label = n->label;
00420 if (topLabelCollection.find( label ) == topLabelCollection.end())
00421 topLabelCollection[ label ] = 0;
00422 topLabelCollection[ label ]++;
00423 }
00424
00425
00426 line = cleanLine;
00427 return true;
00428 }
00429
00430 }