00001 #include <iostream>
00002
00003 #include "moses/ChartHypothesis.h"
00004 #include "moses/ChartManager.h"
00005 #include "moses/FactorCollection.h"
00006 #include "moses/Sentence.h"
00007
00008 #include "util/exception.hh"
00009 #include "util/string_stream.hh"
00010
00011 #include "SparseHieroReorderingFeature.h"
00012
00013 using namespace std;
00014
00015 namespace Moses
00016 {
00017
00018 SparseHieroReorderingFeature::SparseHieroReorderingFeature(const std::string &line)
00019 :StatelessFeatureFunction(0, line),
00020 m_type(SourceCombined),
00021 m_sourceFactor(0),
00022 m_targetFactor(0),
00023 m_sourceVocabFile(""),
00024 m_targetVocabFile("")
00025 {
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036 cerr << "Constructing a Sparse Reordering feature" << endl;
00037 ReadParameters();
00038 m_otherFactor = FactorCollection::Instance().AddFactor("##OTHER##");
00039 LoadVocabulary(m_sourceVocabFile, m_sourceVocab);
00040 LoadVocabulary(m_targetVocabFile, m_targetVocab);
00041 }
00042
00043 void SparseHieroReorderingFeature::SetParameter(const std::string& key, const std::string& value)
00044 {
00045 if (key == "input-factor") {
00046 m_sourceFactor = Scan<FactorType>(value);
00047 } else if (key == "output-factor") {
00048 m_targetFactor = Scan<FactorType>(value);
00049 } else if (key == "input-vocab-file") {
00050 m_sourceVocabFile = value;
00051 } else if (key == "output-vocab-file") {
00052 m_targetVocabFile = value;
00053 } else if (key == "type") {
00054 if (value == "SourceCombined") {
00055 m_type = SourceCombined;
00056 } else if (value == "SourceLeft") {
00057 m_type = SourceLeft;
00058 } else if (value == "SourceRight") {
00059 m_type = SourceRight;
00060 } else {
00061 UTIL_THROW(util::Exception, "Unknown sparse reordering type " << value);
00062 }
00063 } else {
00064 FeatureFunction::SetParameter(key, value);
00065 }
00066 }
00067
00068 void SparseHieroReorderingFeature::LoadVocabulary(const std::string& filename, Vocab& vocab)
00069 {
00070 if (filename.empty()) return;
00071 ifstream in(filename.c_str());
00072 UTIL_THROW_IF(!in, util::Exception, "Unable to open vocab file: " << filename);
00073 string line;
00074 while(getline(in,line)) {
00075 vocab.insert(FactorCollection::Instance().AddFactor(line));
00076 }
00077 in.close();
00078 }
00079
00080 const Factor* SparseHieroReorderingFeature::GetFactor(const Word& word, const Vocab& vocab, FactorType factorType) const
00081 {
00082 const Factor* factor = word.GetFactor(factorType);
00083 if (vocab.size() && vocab.find(factor) == vocab.end()) return m_otherFactor;
00084 return factor;
00085 }
00086
00087 void SparseHieroReorderingFeature::EvaluateWhenApplied(
00088 const ChartHypothesis& cur_hypo ,
00089 ScoreComponentCollection* accumulator) const
00090 {
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112 size_t sourceStart = cur_hypo.GetCurrSourceRange().GetStartPos();
00113 size_t sourceSize = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
00114
00115 vector<Range> sourceNTSpans;
00116 for (size_t prevHypoId = 0; prevHypoId < cur_hypo.GetPrevHypos().size(); ++prevHypoId) {
00117 sourceNTSpans.push_back(cur_hypo.GetPrevHypo(prevHypoId)->GetCurrSourceRange());
00118 }
00119
00120 sort(sourceNTSpans.begin(), sourceNTSpans.end());
00121
00122
00123
00124
00125 typedef pair<Range,bool> Block;
00126 vector<Block> sourceBlocks;
00127 sourceBlocks.push_back(Block(cur_hypo.GetCurrSourceRange(),false));
00128 for (vector<Range>::const_iterator i = sourceNTSpans.begin();
00129 i != sourceNTSpans.end(); ++i) {
00130 const Range& prevHypoRange = *i;
00131 Block lastBlock = sourceBlocks.back();
00132 sourceBlocks.pop_back();
00133
00134 if (prevHypoRange.GetStartPos() > lastBlock.first.GetStartPos()) {
00135 sourceBlocks.push_back(Block(Range(lastBlock.first.GetStartPos(),prevHypoRange.GetStartPos()-1),false));
00136 }
00137 sourceBlocks.push_back(Block(prevHypoRange,true));
00138 if (prevHypoRange.GetEndPos() < lastBlock.first.GetEndPos()) {
00139 sourceBlocks.push_back(Block(Range(prevHypoRange.GetEndPos()+1,lastBlock.first.GetEndPos()), false));
00140 }
00141 }
00142
00143
00144
00145
00146
00147
00148
00149
00150 vector<size_t> sourceWordToTargetRulePos(sourceSize);
00151 map<size_t,size_t> alignMap;
00152 alignMap.insert(
00153 cur_hypo.GetCurrTargetPhrase().GetAlignTerm().begin(),
00154 cur_hypo.GetCurrTargetPhrase().GetAlignTerm().end());
00155 alignMap.insert(
00156 cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().begin(),
00157 cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().end());
00158
00159 size_t sourceRulePos = 0;
00160
00161 for (vector<Block>::const_iterator sourceBlockIt = sourceBlocks.begin();
00162 sourceBlockIt != sourceBlocks.end(); ++sourceBlockIt) {
00163 for (size_t sourceWordPos = sourceBlockIt->first.GetStartPos();
00164 sourceWordPos <= sourceBlockIt->first.GetEndPos(); ++sourceWordPos) {
00165 sourceWordToTargetRulePos[sourceWordPos - sourceStart] = alignMap[sourceRulePos];
00166
00167 if (! sourceBlockIt->second) {
00168
00169 ++sourceRulePos;
00170 }
00171 }
00172 if ( sourceBlockIt->second) {
00173
00174 ++sourceRulePos;
00175 }
00176 }
00177
00178
00179
00180 const Sentence& sentence =
00181 static_cast<const Sentence&>(cur_hypo.GetManager().GetSource());
00182
00183 for (size_t i = 0; i < sourceBlocks.size()-1; ++i) {
00184 Block& leftSourceBlock = sourceBlocks[i];
00185 Block& rightSourceBlock = sourceBlocks[i+1];
00186 size_t sourceLeftBoundaryPos = leftSourceBlock.first.GetEndPos();
00187 size_t sourceRightBoundaryPos = rightSourceBlock.first.GetStartPos();
00188 const Word& sourceLeftBoundaryWord = sentence.GetWord(sourceLeftBoundaryPos);
00189 const Word& sourceRightBoundaryWord = sentence.GetWord(sourceRightBoundaryPos);
00190 sourceLeftBoundaryPos -= sourceStart;
00191 sourceRightBoundaryPos -= sourceStart;
00192
00193
00194 size_t targetLeftRulePos =
00195 sourceWordToTargetRulePos[sourceLeftBoundaryPos];
00196 size_t targetRightRulePos =
00197 sourceWordToTargetRulePos[sourceRightBoundaryPos];
00198
00199 bool isMonotone = true;
00200 if ((sourceLeftBoundaryPos < sourceRightBoundaryPos &&
00201 targetLeftRulePos > targetRightRulePos) ||
00202 ((sourceLeftBoundaryPos > sourceRightBoundaryPos &&
00203 targetLeftRulePos < targetRightRulePos))) {
00204 isMonotone = false;
00205 }
00206 util::StringStream buf;
00207 buf << "h_";
00208 if (m_type == SourceLeft || m_type == SourceCombined) {
00209 buf << GetFactor(sourceLeftBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString();
00210 buf << "_";
00211 }
00212 if (m_type == SourceRight || m_type == SourceCombined) {
00213 buf << GetFactor(sourceRightBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString();
00214 buf << "_";
00215 }
00216 buf << (isMonotone ? "M" : "S");
00217 accumulator->PlusEquals(this,buf.str(), 1);
00218 }
00219
00220 }
00221
00222
00223 }
00224