00001
00002
00003 #include "TreeInput.h"
00004 #include "StaticData.h"
00005 #include "Util.h"
00006 #include "XmlOption.h"
00007 #include "FactorCollection.h"
00008 #include "moses/TranslationModel/PhraseDictionary.h"
00009
00010 using namespace std;
00011
00012 namespace Moses
00013 {
00014
00025 bool
00026 TreeInput::
00027 ProcessAndStripXMLTags(AllOptions const& opts, string &line,
00028 std::vector<XMLParseOutput> &sourceLabels,
00029 std::vector<XmlOption const*> &xmlOptions)
00030 {
00031
00032
00033 vector<FactorType> const& oFactors = opts.output.factor_order;
00034
00035
00036 if (line.find_first_of('<') == string::npos) {
00037 return true;
00038 }
00039
00040
00041 PhraseDictionary *firstPt = NULL;
00042 if (PhraseDictionary::GetColl().size() == 0) {
00043 firstPt = PhraseDictionary::GetColl()[0];
00044 }
00045
00046
00047
00048 vector<string> xmlTokens = TokenizeXml(line);
00049
00050
00051
00052 typedef pair< string, pair< size_t, string > > OpenedTag;
00053 vector< OpenedTag > tagStack;
00054
00055 string cleanLine;
00056 size_t wordPos = 0;
00057
00058
00059 for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
00060
00061 if(!isXmlTag(xmlTokens[xmlTokenPos])) {
00062
00063 if (cleanLine.size()>0 &&
00064 cleanLine[cleanLine.size() - 1] != ' ' &&
00065 xmlTokens[xmlTokenPos][0] != ' ') {
00066 cleanLine += " ";
00067 }
00068 cleanLine += xmlTokens[xmlTokenPos];
00069 wordPos = Tokenize(cleanLine).size();
00070 }
00071
00072
00073 else {
00074
00075
00076
00077 string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
00078 VERBOSE(3,"XML TAG IS: " << tag << std::endl);
00079
00080 if (tag.size() == 0) {
00081 TRACE_ERR("ERROR: empty tag name: " << line << endl);
00082 return false;
00083 }
00084
00085
00086 bool isUnary = ( tag[tag.size() - 1] == '/' );
00087
00088
00089 bool isClosed = ( tag[0] == '/' );
00090 bool isOpen = !isClosed;
00091
00092 if (isClosed && isUnary) {
00093 TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
00094 return false;
00095 }
00096
00097 if (isClosed)
00098 tag = tag.substr(1);
00099 if (isUnary)
00100 tag = tag.substr(0,tag.size()-1);
00101
00102
00103 string::size_type endOfName = tag.find_first_of(' ');
00104 string tagName = tag;
00105 string tagContent = "";
00106 if (endOfName != string::npos) {
00107 tagName = tag.substr(0,endOfName);
00108 tagContent = tag.substr(endOfName+1);
00109 }
00110
00111
00112
00113 if (isOpen || isUnary) {
00114
00115 OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
00116 tagStack.push_back( openedTag );
00117 VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
00118 }
00119
00120
00121
00122 if (isClosed || isUnary) {
00123
00124 if (tagStack.size() == 0) {
00125 TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
00126 return false;
00127 }
00128 OpenedTag openedTag = tagStack.back();
00129 tagStack.pop_back();
00130
00131
00132 if (openedTag.first != tagName) {
00133 TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
00134 return false;
00135 }
00136
00137
00138 size_t startPos = openedTag.second.first;
00139 string tagContent = openedTag.second.second;
00140 size_t endPos = wordPos;
00141
00142
00143 string span = ParseXmlTagAttribute(tagContent,"span");
00144 if (! span.empty()) {
00145 vector<string> ij = Tokenize(span, "-");
00146 if (ij.size() != 1 && ij.size() != 2) {
00147 TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
00148 return false;
00149 }
00150 startPos = atoi(ij[0].c_str());
00151 if (ij.size() == 1) endPos = startPos + 1;
00152 else endPos = atoi(ij[1].c_str()) + 1;
00153 }
00154
00155 VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
00156
00157 if (startPos == endPos) {
00158 TRACE_ERR("WARNING: tag " << tagName << " span is empty. Ignoring: " << line << endl);
00159 continue;
00160 } else if (startPos > endPos) {
00161 TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl);
00162 return false;
00163 }
00164
00165
00166 string label = ParseXmlTagAttribute(tagContent,"label");
00167 string translation = ParseXmlTagAttribute(tagContent,"translation");
00168
00169
00170 if (translation.length() == 0 && label.length() > 0) {
00171 Range range(startPos,endPos-1);
00172 XMLParseOutput item(label, range);
00173 sourceLabels.push_back(item);
00174 }
00175
00176
00177 if (translation.length() > 0 && opts.input.xml_policy != XmlIgnore) {
00178 vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
00179 vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
00180 vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
00181
00182 for (size_t i=0; i<altTexts.size(); ++i) {
00183
00184 TargetPhrase targetPhrase(firstPt);
00185 targetPhrase.CreateFromString(Output, oFactors, altTexts[i], NULL);
00186
00187
00188 string targetLHSstr;
00189 if (altLabel.size() > i && altLabel[i].size() > 0) {
00190 targetLHSstr = altLabel[i];
00191 } else {
00192 const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
00193 UnknownLHSList::const_iterator iterLHS = lhsList.begin();
00194 targetLHSstr = iterLHS->first;
00195 }
00196 Word *targetLHS = new Word(true);
00197 targetLHS->CreateFromString(Output, oFactors, targetLHSstr, true);
00198 UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL,
00199 "Null factor left-hand-side");
00200 targetPhrase.SetTargetLHS(targetLHS);
00201
00202
00203 Phrase sourcePhrase = this->GetSubString(Range(startPos,endPos-1));
00204
00205
00206 float probValue = 1;
00207 if (altProbs.size() > i && altProbs[i].size() > 0) {
00208 probValue = Scan<float>(altProbs[i]);
00209 }
00210
00211 float scoreValue = FloorScore(TransformScore(probValue));
00212 targetPhrase.SetXMLScore(scoreValue);
00213 targetPhrase.EvaluateInIsolation(sourcePhrase);
00214
00215
00216 Range range(startPos+1,endPos);
00217 XmlOption *option = new XmlOption(range,targetPhrase);
00218 assert(option);
00219 xmlOptions.push_back(option);
00220
00221 VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl);
00222 }
00223 altTexts.clear();
00224 altProbs.clear();
00225 }
00226 }
00227 }
00228 }
00229
00230 if (tagStack.size() > 0) {
00231 TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
00232 return false;
00233 }
00234
00235
00236 line = cleanLine;
00237 return true;
00238 }
00239
00241 int
00242 TreeInput::
00243 Read(std::istream& in)
00244 {
00245 string line;
00246 if (getline(in, line, '\n').eof())
00247 return 0;
00248 m_labelledSpans.clear();
00249 ProcessAndStripXMLTags(*m_options, line, m_labelledSpans, m_xmlOptions);
00250
00251
00252 stringstream strme;
00253 strme << line << endl;
00254
00255 Sentence::Read(strme);
00256
00257
00258 size_t sourceSize = GetSize();
00259 m_sourceChart.resize(sourceSize);
00260
00261 for (size_t pos = 0; pos < sourceSize; ++pos) {
00262 m_sourceChart[pos].resize(sourceSize - pos);
00263 }
00264
00265
00266 vector<XMLParseOutput>::const_iterator iterLabel;
00267 for (iterLabel = m_labelledSpans.begin();
00268 iterLabel != m_labelledSpans.end(); ++iterLabel) {
00269 const XMLParseOutput &labelItem = *iterLabel;
00270 const Range &range = labelItem.m_range;
00271 const string &label = labelItem.m_label;
00272 AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label);
00273 }
00274
00275
00276 bool only4empty = m_options->syntax.default_non_term_only_for_empty_range;
00277 for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
00278 for (size_t endPos = startPos; endPos < sourceSize; ++endPos) {
00279 NonTerminalSet &list = GetLabelSet(startPos, endPos);
00280 if (list.size() == 0 || ! only4empty ) {
00281 AddChartLabel(startPos, endPos, m_options->syntax.input_default_non_terminal);
00282 }
00283 }
00284 }
00285
00286 return 1;
00287 }
00288
00290 void TreeInput::Print(std::ostream &out) const
00291 {
00292 out << *this << "\n";
00293 }
00294
00296 TranslationOptionCollection* TreeInput::CreateTranslationOptionCollection() const
00297 {
00298
00299 return NULL;
00300 }
00301
00302 void
00303 TreeInput::
00304 AddChartLabel(size_t startPos, size_t endPos, const Word &label)
00305 {
00306 UTIL_THROW_IF2(!label.IsNonTerminal(),
00307 "Label must be a non-terminal");
00308 SourceLabelOverlap overlapType = m_options->syntax.source_label_overlap;
00309 NonTerminalSet &list = GetLabelSet(startPos, endPos);
00310 switch (overlapType) {
00311 case SourceLabelOverlapAdd:
00312 list.insert(label);
00313 break;
00314 case SourceLabelOverlapReplace:
00315 if (list.size() > 0)
00316 list.clear();
00317 list.insert(label);
00318 break;
00319 case SourceLabelOverlapDiscard:
00320 if (list.size() == 0)
00321 list.insert(label);
00322 break;
00323 }
00324 }
00325
00326 void
00327 TreeInput::
00328 AddChartLabel(size_t startPos, size_t endPos, const string &label)
00329 {
00330 const std::vector<FactorType>& fOrder = m_options->input.factor_order;
00331 Word word(true);
00332 const Factor *factor
00333 = FactorCollection::Instance().AddFactor(Input, fOrder[0], label, true);
00334
00335 word.SetFactor(0, factor);
00336 AddChartLabel(startPos, endPos, word);
00337 }
00338
00339 std::ostream& operator<<(std::ostream &out, const TreeInput &input)
00340 {
00341 out<< static_cast<Phrase const&>(input) << " ||| ";
00342
00343 size_t size = input.GetSize();
00344 for (size_t startPos = 0; startPos < size; ++startPos) {
00345 for (size_t endPos = startPos; endPos < size; ++endPos) {
00346 const NonTerminalSet &labelSet = input.GetLabelSet(startPos, endPos);
00347 NonTerminalSet::const_iterator iter;
00348 for (iter = labelSet.begin(); iter != labelSet.end(); ++iter) {
00349 const Word &word = *iter;
00350 UTIL_THROW_IF2(!word.IsNonTerminal(),
00351 "Word must be a non-terminal");
00352 out << "[" << startPos <<"," << endPos << "]="
00353 << word << "(" << word.IsNonTerminal() << ") ";
00354 }
00355 }
00356 }
00357
00358 return out;
00359 }
00360
00361
00362 }
00363