00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <stdexcept>
00024 #include <boost/algorithm/string.hpp>
00025 #include <boost/foreach.hpp>
00026
00027 #include "Sentence.h"
00028 #include "TranslationOptionCollectionText.h"
00029 #include "StaticData.h"
00030 #include "moses/FF/DynamicCacheBasedLanguageModel.h"
00031 #include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h"
00032 #include "ChartTranslationOptions.h"
00033 #include "Util.h"
00034 #include "XmlOption.h"
00035 #include "FactorCollection.h"
00036 #include "TranslationTask.h"
00037
00038 using namespace std;
00039
00040 namespace Moses
00041 {
00042
00043 Sentence::
00044 Sentence(AllOptions::ptr const& opts) : Phrase(0) , InputType(opts)
00045 {
00046 if (is_syntax(opts->search.algo))
00047 m_defaultLabelSet.insert(opts->syntax.input_default_non_terminal);
00048 }
00049
00050 Sentence::
00051 ~Sentence()
00052 {
00053 RemoveAllInColl(m_xmlOptions);
00054 }
00055
00056 void
00057 Sentence::
00058 aux_init_partial_translation(string& line)
00059 {
00060 string sourceCompletedStr;
00061 int loc1 = line.find( "|||", 0 );
00062 int loc2 = line.find( "|||", loc1 + 3 );
00063 if (loc1 > -1 && loc2 > -1) {
00064 m_initialTargetPhrase = Trim(line.substr(0, loc1));
00065 string scov = Trim(line.substr(loc1 + 3, loc2 - loc1 - 3));
00066 line = line.substr(loc2 + 3);
00067
00068 m_sourceCompleted.resize(scov.size());
00069 int contiguous = 1;
00070 for (size_t i = 0; i < scov.size(); ++i) {
00071 if (sourceCompletedStr.at(i) == '1') {
00072 m_sourceCompleted[i] = true;
00073 if (contiguous) m_frontSpanCoveredLength++;
00074 } else {
00075 m_sourceCompleted[i] = false;
00076 contiguous = 0;
00077 }
00078 }
00079 }
00080 }
00081
00082 void
00083 Sentence::
00084 aux_interpret_sgml_markup(string& line)
00085 {
00086
00087 typedef std::map<std::string, std::string> metamap;
00088 metamap meta = ProcessAndStripSGML(line);
00089 metamap::const_iterator i;
00090 if ((i = meta.find("id")) != meta.end())
00091 this->SetTranslationId(atol(i->second.c_str()));
00092 if ((i = meta.find("docid")) != meta.end()) {
00093 this->SetDocumentId(atol(i->second.c_str()));
00094 this->SetUseTopicId(false);
00095 this->SetUseTopicIdAndProb(false);
00096 }
00097 if ((i = meta.find("topic")) != meta.end()) {
00098 vector<string> topic_params;
00099 boost::split(topic_params, i->second, boost::is_any_of("\t "));
00100 if (topic_params.size() == 1) {
00101 this->SetTopicId(atol(topic_params[0].c_str()));
00102 this->SetUseTopicId(true);
00103 this->SetUseTopicIdAndProb(false);
00104 } else {
00105 this->SetTopicIdAndProb(topic_params);
00106 this->SetUseTopicId(false);
00107 this->SetUseTopicIdAndProb(true);
00108 }
00109 }
00110 if ((i = meta.find("weight-setting")) != meta.end()) {
00111 this->SetWeightSetting(i->second);
00112 this->SetSpecifiesWeightSetting(true);
00113 StaticData::Instance().SetWeightSetting(i->second);
00114
00115
00116 } else this->SetSpecifiesWeightSetting(false);
00117 }
00118
00119 void
00120 Sentence::
00121 aux_interpret_dlt(string& line)
00122 {
00123 using namespace std;
00124 typedef map<string, string> str2str_map;
00125 m_dlt_meta = ProcessAndStripDLT(line);
00126
00127 BOOST_FOREACH(str2str_map const& M, m_dlt_meta) {
00128 str2str_map::const_iterator i,j;
00129 if ((i = M.find("type")) != M.end()) {
00130 j = M.find("id");
00131 string id = j == M.end() ? "default" : j->second;
00132 if (i->second == "cbtm") {
00133 PhraseDictionaryDynamicCacheBased* cbtm;
00134 cbtm = PhraseDictionaryDynamicCacheBased::InstanceNonConst(id);
00135 if (cbtm) cbtm->ExecuteDlt(M);
00136 }
00137 if (i->second == "cblm") {
00138 DynamicCacheBasedLanguageModel* cblm;
00139 cblm = DynamicCacheBasedLanguageModel::InstanceNonConst(id);
00140 if (cblm) cblm->ExecuteDlt(M);
00141 }
00142 }
00143 }
00144 }
00145
00146 void
00147 Sentence::
00148 aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
00149 std::vector<std::pair<size_t, std::string> >& placeholders)
00150 {
00151
00152 using namespace std;
00153 if (m_options->input.xml_policy != XmlPassThrough) {
00154 bool OK = ProcessAndStripXMLTags(*m_options, line,
00155 m_xmlOptions,
00156 m_reorderingConstraint,
00157 xmlWalls, placeholders,
00158 *this);
00159 if (!OK) {
00160 TRACE_ERR("Unable to parse XML in line: " << line);
00161 }
00162 }
00163 }
00164
00165 void
00166 Sentence::
00167 init(string line)
00168 {
00169 using namespace std;
00170
00171 m_frontSpanCoveredLength = 0;
00172 m_sourceCompleted.resize(0);
00173
00174 if (m_options->input.continue_partial_translation)
00175 aux_init_partial_translation(line);
00176
00177 line = Trim(line);
00178 aux_interpret_sgml_markup(line);
00179 aux_interpret_dlt(line);
00180
00181
00182 if (m_options->output.PrintPassThrough ||m_options->nbest.include_passthrough) {
00183 string pthru = PassthroughSGML(line,"passthrough");
00184 this->SetPassthroughInformation(pthru);
00185 }
00186
00187 vector<size_t> xmlWalls;
00188 vector<pair<size_t, string> >placeholders;
00189 aux_interpret_xml(line, xmlWalls, placeholders);
00190
00191 Phrase::CreateFromString(Input, m_options->input.factor_order, line, NULL);
00192
00193 ProcessPlaceholders(placeholders);
00194
00195 if (is_syntax(m_options->search.algo))
00196 InitStartEndWord();
00197
00198
00199
00200
00201
00202
00203 if (m_options->input.xml_policy != XmlPassThrough) {
00204 m_xmlCoverageMap.assign(GetSize(), false);
00205 BOOST_FOREACH(XmlOption const* o, m_xmlOptions) {
00206 Range const& r = o->range;
00207 for(size_t j = r.GetStartPos(); j <= r.GetEndPos(); ++j)
00208 m_xmlCoverageMap[j]=true;
00209 }
00210 }
00211
00212
00213 m_reorderingConstraint.InitializeWalls(GetSize());
00214
00215
00216 if (m_options->reordering.monotone_at_punct && GetSize()) {
00217 Range r(0, GetSize()-1);
00218 m_reorderingConstraint.SetMonotoneAtPunctuation(GetSubString(r));
00219 }
00220
00221
00222 for(size_t i=0; i<xmlWalls.size(); i++)
00223 if(xmlWalls[i] < GetSize())
00224 m_reorderingConstraint.SetWall(xmlWalls[i], true);
00225 m_reorderingConstraint.FinalizeWalls();
00226
00227 }
00228
00229 int
00230 Sentence::
00231 Read(std::istream& in)
00232 {
00233 std::string line;
00234 if (getline(in, line, '\n').eof())
00235 return 0;
00236 init(line);
00237 return 1;
00238 }
00239
00240 void
00241 Sentence::
00242 ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeholders)
00243 {
00244 FactorType placeholderFactor = m_options->input.placeholder_factor;
00245 if (placeholderFactor == NOT_FOUND) {
00246 return;
00247 }
00248
00249 for (size_t i = 0; i < placeholders.size(); ++i) {
00250 size_t pos = placeholders[i].first;
00251 const string &str = placeholders[i].second;
00252 const Factor *factor = FactorCollection::Instance().AddFactor(str);
00253 Word &word = Phrase::GetWord(pos);
00254 word[placeholderFactor] = factor;
00255 }
00256 }
00257
00258 TranslationOptionCollection*
00259 Sentence::
00260 CreateTranslationOptionCollection(ttasksptr const& ttask) const
00261 {
00262 TranslationOptionCollection *rv
00263 = new TranslationOptionCollectionText(ttask, *this);
00264 assert(rv);
00265 return rv;
00266 }
00267 void Sentence::Print(std::ostream& out) const
00268 {
00269 out<<*static_cast<Phrase const*>(this);
00270 }
00271
00272
00273 bool Sentence::XmlOverlap(size_t startPos, size_t endPos) const
00274 {
00275 for (size_t pos = startPos; pos <= endPos ; pos++) {
00276 if (pos < m_xmlCoverageMap.size() && m_xmlCoverageMap[pos]) {
00277 return true;
00278 }
00279 }
00280 return false;
00281 }
00282
00283 void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list) const
00284 {
00285 for (std::vector<XmlOption const*>::const_iterator iterXMLOpts = m_xmlOptions.begin();
00286 iterXMLOpts != m_xmlOptions.end(); ++iterXMLOpts) {
00287 const XmlOption &xmlOption = **iterXMLOpts;
00288 const Range &range = xmlOption.range;
00289 const TargetPhrase &targetPhrase = xmlOption.targetPhrase;
00290 TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
00291 list.push_back(transOpt);
00292 }
00293 }
00294
00295 void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const
00296 {
00297
00298
00299 for (std::vector<XmlOption const*>::const_iterator iterXMLOpts = m_xmlOptions.begin();
00300 iterXMLOpts != m_xmlOptions.end(); ++iterXMLOpts) {
00301 const XmlOption &xmlOption = **iterXMLOpts;
00302 const Range &range = xmlOption.range;
00303
00304 if (startPos == range.GetStartPos()
00305 && endPos == range.GetEndPos()) {
00306 const TargetPhrase &targetPhrase = xmlOption.targetPhrase;
00307
00308 TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
00309 list.push_back(transOpt);
00310 }
00311 }
00312 }
00313
00314 std::vector <ChartTranslationOptions*>
00315 Sentence::
00316 GetXmlChartTranslationOptions() const
00317 {
00318 std::vector <ChartTranslationOptions*> ret;
00319
00320
00321
00322
00323
00324 if (m_options->input.xml_policy != XmlPassThrough ) {
00325
00326
00327
00328
00329
00330
00331
00332 for(std::vector<XmlOption const*>::const_iterator iterXmlOpts = m_xmlOptions.begin();
00333 iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) {
00334
00335 const XmlOption &xmlOption = **iterXmlOpts;
00336 TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase);
00337
00338 Range *range = new Range(xmlOption.range);
00339 StackVec emptyStackVec;
00340
00341 TargetPhraseCollection *tpc = new TargetPhraseCollection;
00342 tpc->Add(targetPhrase);
00343
00344 ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
00345 ret.push_back(transOpt);
00346
00347
00348
00349
00350
00351 }
00352 }
00353
00354 return ret;
00355 }
00356
00357 void
00358 Sentence::
00359 CreateFromString(vector<FactorType> const& FOrder, string const& phraseString)
00360 {
00361 Phrase::CreateFromString(Input, FOrder, phraseString, NULL);
00362 }
00363
00364 Sentence::
00365 Sentence(AllOptions::ptr const& opts, size_t const transId, string stext)
00366 : InputType(opts, transId)
00367 {
00368 init(stext);
00369 }
00370
00371 }
00372