00001 #include <fstream>
00002
00003 #include "moses/FactorCollection.h"
00004 #include "moses/InputPath.h"
00005 #include "moses/Util.h"
00006
00007 #include "util/exception.hh"
00008
00009 #include "util/file_piece.hh"
00010 #include "util/string_piece.hh"
00011 #include "util/string_stream.hh"
00012 #include "util/tokenize_piece.hh"
00013
00014 #include "LexicalReordering.h"
00015 #include "SparseReordering.h"
00016
00017 #include <boost/algorithm/string/predicate.hpp>
00018
00019
00020 using namespace std;
00021 using namespace boost::algorithm;
00022
00023 namespace Moses
00024 {
00025
00026 const std::string& SparseReorderingFeatureKey::Name (const string& wordListId)
00027 {
00028 static string kSep = "-";
00029 static string name;
00030 util::StringStream buf;
00031
00032 if (type == Phrase) {
00033 buf << "phr";
00034 } else if (type == Stack) {
00035 buf << "stk";
00036 } else if (type == Between) {
00037 buf << "btn";
00038 }
00039 buf << kSep;
00040 if (side == Source) {
00041 buf << "src";
00042 } else if (side == Target) {
00043 buf << "tgt";
00044 }
00045 buf << kSep;
00046 if (position == First) {
00047 buf << "first";
00048 } else if (position == Last) {
00049 buf << "last";
00050 }
00051 buf << kSep;
00052 buf << wordListId;
00053 buf << kSep;
00054 if (isCluster) buf << "cluster_";
00055 buf << word->GetString();
00056 buf << kSep;
00057 buf << reoType;
00058 name = buf.str();
00059 return name;
00060 }
00061
00062 SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
00063 : m_producer(producer)
00064 , m_useWeightMap(false)
00065 {
00066 static const string kSource= "source";
00067 static const string kTarget = "target";
00068 for (map<string,string>::const_iterator i = config.begin(); i != config.end(); ++i) {
00069 vector<string> fields = Tokenize(i->first, "-");
00070 if (fields[0] == "words") {
00071 UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering word list name should be sparse-words-(source|target)-<id>");
00072 if (fields[1] == kSource) {
00073 ReadWordList(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceWordLists);
00074 } else if (fields[1] == kTarget) {
00075 ReadWordList(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetWordLists);
00076 } else {
00077 UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
00078 }
00079 } else if (fields[0] == "clusters") {
00080 UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering cluster name should be sparse-clusters-(source|target)-<id>");
00081 if (fields[1] == kSource) {
00082 ReadClusterMap(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceClusterMaps);
00083 } else if (fields[1] == kTarget) {
00084 ReadClusterMap(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetClusterMaps);
00085 } else {
00086 UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
00087 }
00088 } else if (fields[0] == "weights") {
00089 ReadWeightMap(i->second);
00090 m_useWeightMap = true;
00091 for (int reoType=0; reoType<=LRModel::MAX; ++reoType) {
00092 util::StringStream buf;
00093 buf << reoType;
00094 m_featureMap2.push_back(m_producer->GetFeatureName(buf.str()));
00095 }
00096
00097 } else if (fields[0] == "phrase") {
00098 m_usePhrase = true;
00099 } else if (fields[0] == "stack") {
00100 m_useStack = true;
00101 } else if (fields[0] == "between") {
00102 m_useBetween = true;
00103 } else {
00104 UTIL_THROW(util::Exception, "Unable to parse sparse reordering option: " << i->first);
00105 }
00106 }
00107
00108 }
00109
00110 void SparseReordering::PreCalculateFeatureNames(size_t index, const string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster)
00111 {
00112 for (size_t type = SparseReorderingFeatureKey::Stack;
00113 type <= SparseReorderingFeatureKey::Between; ++type) {
00114 for (size_t position = SparseReorderingFeatureKey::First;
00115 position <= SparseReorderingFeatureKey::Last; ++position) {
00116 for (int reoType = 0; reoType <= LRModel::MAX; ++reoType) {
00117 SparseReorderingFeatureKey
00118 key(index, static_cast<SparseReorderingFeatureKey::Type>(type),
00119 factor, isCluster,
00120 static_cast<SparseReorderingFeatureKey::Position>(position),
00121 side, static_cast<LRModel::ReorderingType>(reoType));
00122 m_featureMap.insert(pair<SparseReorderingFeatureKey, FName>(key,m_producer->GetFeatureName(key.Name(id))));
00123 }
00124 }
00125 }
00126 }
00127
00128 void SparseReordering::ReadWordList(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<WordList>* pWordLists)
00129 {
00130 ifstream fh(filename.c_str());
00131 UTIL_THROW_IF(!fh, util::Exception, "Unable to open: " << filename);
00132 string line;
00133 pWordLists->push_back(WordList());
00134 pWordLists->back().first = id;
00135 while (getline(fh,line)) {
00136
00137 const Factor* factor = FactorCollection::Instance().AddFactor(line);
00138 pWordLists->back().second.insert(factor);
00139 PreCalculateFeatureNames(pWordLists->size()-1, id, side, factor, false);
00140
00141 }
00142 }
00143
00144 void SparseReordering::ReadClusterMap(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<ClusterMap>* pClusterMaps)
00145 {
00146 pClusterMaps->push_back(ClusterMap());
00147 pClusterMaps->back().first = id;
00148 util::FilePiece file(filename.c_str());
00149 StringPiece line;
00150 while (true) {
00151 try {
00152 line = file.ReadLine();
00153 } catch (const util::EndOfFileException &e) {
00154 break;
00155 }
00156 util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter('\t'));
00157 if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing word): '" << line << "'");
00158 const Factor* wordFactor = FactorCollection::Instance().AddFactor(*lineIter);
00159 ++lineIter;
00160 if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing cluster id): '" << line << "'");
00161 const Factor* idFactor = FactorCollection::Instance().AddFactor(*lineIter);
00162 pClusterMaps->back().second[wordFactor] = idFactor;
00163 PreCalculateFeatureNames(pClusterMaps->size()-1, id, side, idFactor, true);
00164 }
00165 }
00166
00167 void SparseReordering::AddFeatures(
00168 SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
00169 const Word& word, SparseReorderingFeatureKey::Position position,
00170 LRModel::ReorderingType reoType,
00171 ScoreComponentCollection* scores) const
00172 {
00173
00174 const Factor* wordFactor = word.GetFactor(0);
00175
00176 const vector<WordList>* wordLists;
00177 const vector<ClusterMap>* clusterMaps;
00178 if (side == SparseReorderingFeatureKey::Source) {
00179 wordLists = &m_sourceWordLists;
00180 clusterMaps = &m_sourceClusterMaps;
00181 } else {
00182 wordLists = &m_targetWordLists;
00183 clusterMaps = &m_targetClusterMaps;
00184 }
00185
00186 for (size_t id = 0; id < wordLists->size(); ++id) {
00187 if ((*wordLists)[id].second.find(wordFactor) == (*wordLists)[id].second.end()) continue;
00188 SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType);
00189 FeatureMap::const_iterator fmi = m_featureMap.find(key);
00190 assert(fmi != m_featureMap.end());
00191 if (m_useWeightMap) {
00192 WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
00193 if (wmi != m_weightMap.end()) {
00194 if (wmi->second != 0) {
00195 scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
00196 }
00197 }
00198 } else {
00199 scores->SparsePlusEquals(fmi->second, 1.0);
00200 }
00201 }
00202
00203 for (size_t id = 0; id < clusterMaps->size(); ++id) {
00204 const ClusterMap& clusterMap = (*clusterMaps)[id];
00205 boost::unordered_map<const Factor*, const Factor*>::const_iterator clusterIter
00206 = clusterMap.second.find(wordFactor);
00207 if (clusterIter != clusterMap.second.end()) {
00208 SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType);
00209 FeatureMap::const_iterator fmi = m_featureMap.find(key);
00210 assert(fmi != m_featureMap.end());
00211 if (m_useWeightMap) {
00212 WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
00213 if (wmi != m_weightMap.end()) {
00214 if (wmi->second != 0) {
00215 scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
00216 }
00217 }
00218 } else {
00219 scores->SparsePlusEquals(fmi->second, 1.0);
00220 }
00221 }
00222 }
00223
00224 }
00225
00226 void SparseReordering::CopyScores(
00227 const TranslationOption& currentOpt,
00228 const TranslationOption* previousOpt,
00229 const InputType& input,
00230 LRModel::ReorderingType reoType,
00231 LRModel::Direction direction,
00232 ScoreComponentCollection* scores) const
00233 {
00234 if (m_useBetween && direction == LRModel::Backward &&
00235 (reoType == LRModel::D || reoType == LRModel::DL || reoType == LRModel::DR)) {
00236 size_t gapStart, gapEnd;
00237
00238
00239 const Sentence& sentence = static_cast<const Sentence&>(input);
00240 const Range& currentRange = currentOpt.GetSourceWordsRange();
00241 if (previousOpt) {
00242 const Range& previousRange = previousOpt->GetSourceWordsRange();
00243 if (previousRange < currentRange) {
00244 gapStart = previousRange.GetEndPos() + 1;
00245 gapEnd = currentRange.GetStartPos();
00246 } else {
00247 gapStart = currentRange.GetEndPos() + 1;
00248 gapEnd = previousRange.GetStartPos();
00249 }
00250 } else {
00251
00252 gapStart = 0;
00253 gapEnd = currentRange.GetStartPos();
00254 }
00255 assert(gapStart < gapEnd);
00256 for (size_t i = gapStart; i < gapEnd; ++i) {
00257 AddFeatures(SparseReorderingFeatureKey::Between,
00258 SparseReorderingFeatureKey::Source, sentence.GetWord(i),
00259 SparseReorderingFeatureKey::First, reoType, scores);
00260 }
00261 }
00262
00263
00264
00265 SparseReorderingFeatureKey::Type type;
00266 if (direction == LRModel::Forward) {
00267 if (!m_useStack) return;
00268 type = SparseReorderingFeatureKey::Stack;
00269 } else if (direction == LRModel::Backward) {
00270 if (!m_usePhrase) return;
00271 type = SparseReorderingFeatureKey::Phrase;
00272 } else {
00273
00274
00275 type = SparseReorderingFeatureKey::Phrase;
00276 assert(!"Shouldn't call CopyScores() with bidirectional direction");
00277 }
00278 const Phrase& sourcePhrase = currentOpt.GetInputPath().GetPhrase();
00279 AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(0),
00280 SparseReorderingFeatureKey::First, reoType, scores);
00281 AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(sourcePhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores);
00282 const Phrase& targetPhrase = currentOpt.GetTargetPhrase();
00283 AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(0),
00284 SparseReorderingFeatureKey::First, reoType, scores);
00285 AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(targetPhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores);
00286
00287
00288 }
00289
00290
00291 void SparseReordering::ReadWeightMap(const string& filename)
00292 {
00293 util::FilePiece file(filename.c_str());
00294 StringPiece line;
00295 while (true) {
00296 try {
00297 line = file.ReadLine();
00298 } catch (const util::EndOfFileException &e) {
00299 break;
00300 }
00301 util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter(' '));
00302 UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
00303 const std::string& name = lineIter->as_string();
00304 ++lineIter;
00305 UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
00306 float weight = Moses::Scan<float>(lineIter->as_string());
00307
00308 std::pair< WeightMap::iterator, bool> inserted = m_weightMap.insert( std::make_pair(name, weight) );
00309 UTIL_THROW_IF2(!inserted.second, "Duplicate weight: '" << name << "'");
00310 }
00311 }
00312
00313
00314 }
00315