00001 #include "Model1Feature.h"
00002 #include "moses/StaticData.h"
00003 #include "moses/InputFileStream.h"
00004 #include "moses/ScoreComponentCollection.h"
00005 #include "moses/FactorCollection.h"
00006
00007
00008 using namespace std;
00009
00010 namespace Moses
00011 {
00012
// Surface form GIZA++ uses for the empty (NULL) source word; the
// constructor reserves vocabulary id 0 for it.
const std::string Model1Vocabulary::GIZANULL = "GIZANULL";
00014
00015 Model1Vocabulary::Model1Vocabulary()
00016 {
00017 FactorCollection &factorCollection = FactorCollection::Instance();
00018 m_NULL = factorCollection.AddFactor(GIZANULL,false);
00019 Store(m_NULL,0);
00020 }
00021
00022 bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
00023 {
00024 boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
00025 if ( iter != m_lookup.end() ) {
00026 return false;
00027 }
00028 m_lookup[ word ] = id;
00029 if ( m_vocab.size() <= id ) {
00030 m_vocab.resize(id+1);
00031 }
00032 m_vocab[id] = word;
00033 return true;
00034 }
00035
00036 unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
00037 {
00038 boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
00039
00040 if ( iter != m_lookup.end() ) {
00041 return iter->second;
00042 }
00043
00044 unsigned id = m_vocab.size();
00045 m_vocab.push_back( word );
00046 m_lookup[ word ] = id;
00047 return id;
00048 }
00049
00050 unsigned Model1Vocabulary::GetWordID(const Factor* word) const
00051 {
00052 boost::unordered_map<const Factor*, unsigned>::const_iterator iter = m_lookup.find( word );
00053 if ( iter == m_lookup.end() ) {
00054 return INVALID_ID;
00055 }
00056 return iter->second;
00057 }
00058
00059 const Factor* Model1Vocabulary::GetWord(unsigned id) const
00060 {
00061 if (id >= m_vocab.size()) {
00062 return NULL;
00063 }
00064 return m_vocab[ id ];
00065 }
00066
00067 void Model1Vocabulary::Load(const std::string& fileName)
00068 {
00069 InputFileStream inFile(fileName);
00070 FactorCollection &factorCollection = FactorCollection::Instance();
00071 std::string line;
00072
00073 unsigned i = 0;
00074 if ( getline(inFile, line) ) {
00075 ++i;
00076 std::vector<std::string> tokens = Tokenize(line);
00077 UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
00078 unsigned id = atoll( tokens[0].c_str() );
00079 if (! ( (id == 1) && (tokens[1] == "UNK") )) {
00080 const Factor* factor = factorCollection.AddFactor(tokens[1],false);
00081 bool stored = Store(factor, id);
00082 UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
00083 }
00084 }
00085 while ( getline(inFile, line) ) {
00086 ++i;
00087 std::vector<std::string> tokens = Tokenize(line);
00088 UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
00089 unsigned id = atoll( tokens[0].c_str() );
00090 const Factor* factor = factorCollection.AddFactor(tokens[1],false);
00091 bool stored = Store(factor, id);
00092 UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
00093 }
00094 inFile.Close();
00095 }
00096
00097
00098 void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT)
00099 {
00100 InputFileStream inFile(fileName);
00101 std::string line;
00102
00103 unsigned i = 0;
00104 while ( getline(inFile, line) ) {
00105 ++i;
00106 std::vector<std::string> tokens = Tokenize(line);
00107 UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
00108 unsigned idS = atoll( tokens[0].c_str() );
00109 unsigned idT = atoll( tokens[1].c_str() );
00110 const Factor* wordS = vcbS.GetWord(idS);
00111 const Factor* wordT = vcbT.GetWord(idT);
00112 float prob = std::atof( tokens[2].c_str() );
00113 if ( (wordS != NULL) && (wordT != NULL) ) {
00114 m_ltable[ wordS ][ wordT ] = prob;
00115 }
00116 UTIL_THROW_IF2((wordS == NULL) || (wordT == NULL), "Line " << i << " in " << fileName << " has unknown vocabulary.");
00117 }
00118 inFile.Close();
00119 }
00120
00121
00122 float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* wordT) const
00123 {
00124 float prob = m_floor;
00125
00126 boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > >::const_iterator iter1 = m_ltable.find( wordS );
00127
00128 if ( iter1 != m_ltable.end() ) {
00129 boost::unordered_map< const Factor*, float >::const_iterator iter2 = iter1->second.find( wordT );
00130 if ( iter2 != iter1->second.end() ) {
00131 prob = iter2->second;
00132 if ( prob < m_floor ) {
00133 prob = m_floor;
00134 }
00135 }
00136 }
00137 return prob;
00138 }
00139
00140
00141 Model1Feature::Model1Feature(const std::string &line)
00142 : StatelessFeatureFunction(1, line)
00143 , m_skipTargetPunctuation(false)
00144 , m_is_syntax(false)
00145 {
00146 VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
00147 ReadParameters();
00148 VERBOSE(1, " Done.");
00149 }
00150
00151 void Model1Feature::SetParameter(const std::string& key, const std::string& value)
00152 {
00153 if (key == "path") {
00154 m_fileNameModel1 = value;
00155 } else if (key == "source-vocabulary") {
00156 m_fileNameVcbS = value;
00157 } else if (key == "target-vocabulary") {
00158 m_fileNameVcbT = value;
00159 } else if (key == "skip-target-punctuation") {
00160 m_skipTargetPunctuation = Scan<bool>(value);
00161 } else {
00162 StatelessFeatureFunction::SetParameter(key, value);
00163 }
00164 }
00165
00166 void Model1Feature::Load(AllOptions::ptr const& opts)
00167 {
00168 m_options = opts;
00169 m_is_syntax = is_syntax(opts->search.algo);
00170
00171 FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading source vocabulary from file " << m_fileNameVcbS << " ...");
00172 Model1Vocabulary vcbS;
00173 vcbS.Load(m_fileNameVcbS);
00174 FEATUREVERBOSE2(2, " Done." << std::endl);
00175 FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading target vocabulary from file " << m_fileNameVcbT << " ...");
00176 Model1Vocabulary vcbT;
00177 vcbT.Load(m_fileNameVcbT);
00178 FEATUREVERBOSE2(2, " Done." << std::endl);
00179 FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading model 1 lexical translation table from file " << m_fileNameModel1 << " ...");
00180 m_model1.Load(m_fileNameModel1,vcbS,vcbT);
00181 FEATUREVERBOSE2(2, " Done." << std::endl);
00182 FactorCollection &factorCollection = FactorCollection::Instance();
00183 m_emptyWord = factorCollection.GetFactor(Model1Vocabulary::GIZANULL,false);
00184 UTIL_THROW_IF2(m_emptyWord==NULL, GetScoreProducerDescription()
00185 << ": Factor for GIZA empty word does not exist.");
00186
00187 if (m_skipTargetPunctuation) {
00188 const std::string punctuation = ",;.:!?";
00189 for (size_t i=0; i<punctuation.size(); ++i) {
00190 const std::string punct = punctuation.substr(i,1);
00191 FactorCollection &factorCollection = FactorCollection::Instance();
00192 const Factor* punctFactor = factorCollection.AddFactor(punct,false);
00193 std::pair<std::set<const Factor*>::iterator,bool> inserted = m_punctuation.insert(punctFactor);
00194 }
00195 }
00196 }
00197
00198 void Model1Feature::EvaluateWithSourceContext(const InputType &input
00199 , const InputPath &inputPath
00200 , const TargetPhrase &targetPhrase
00201 , const StackVec *stackVec
00202 , ScoreComponentCollection &scoreBreakdown
00203 , ScoreComponentCollection *estimatedScores) const
00204 {
00205 const Sentence& sentence = static_cast<const Sentence&>(input);
00206 float score = 0.0;
00207 float norm = TransformScore(1+sentence.GetSize());
00208
00209 for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT) {
00210 const Word &wordT = targetPhrase.GetWord(posT);
00211 if (m_skipTargetPunctuation) {
00212 std::set<const Factor*>::const_iterator foundPunctuation = m_punctuation.find(wordT[0]);
00213 if (foundPunctuation != m_punctuation.end()) {
00214 continue;
00215 }
00216 }
00217 if ( !wordT.IsNonTerminal() ) {
00218 float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]);
00219
00220
00221 bool foundInCache = false;
00222 {
00223 #ifdef WITH_THREADS
00224 boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
00225 #endif
00226 boost::unordered_map<const InputType*, boost::unordered_map<const Factor*, float> >::const_iterator sentenceCache = m_cache.find(&input);
00227 if (sentenceCache != m_cache.end()) {
00228 boost::unordered_map<const Factor*, float>::const_iterator cacheHit = sentenceCache->second.find(wordT[0]);
00229 if (cacheHit != sentenceCache->second.end()) {
00230 foundInCache = true;
00231 score += cacheHit->second;
00232 FEATUREVERBOSE(3, "Cached score( " << wordT << " ) = " << cacheHit->second << std::endl);
00233 }
00234 }
00235 }
00236
00237 if (!foundInCache) {
00238 for (size_t posS=(m_is_syntax?1:0); posS<(m_is_syntax?sentence.GetSize()-1:sentence.GetSize()); ++posS) {
00239 const Word &wordS = sentence.GetWord(posS);
00240 float modelProb = m_model1.GetProbability(wordS[0],wordT[0]);
00241 FEATUREVERBOSE(4, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl);
00242 thisWordProb += modelProb;
00243 }
00244 float thisWordScore = TransformScore(thisWordProb) - norm;
00245 FEATUREVERBOSE(3, "score( " << wordT << " ) = " << thisWordScore << std::endl);
00246 {
00247 #ifdef WITH_THREADS
00248
00249 boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
00250 #endif
00251 m_cache[&input][wordT[0]] = thisWordScore;
00252 }
00253 score += thisWordScore;
00254 }
00255 }
00256 }
00257
00258 scoreBreakdown.PlusEquals(this, score);
00259 }
00260
00261 void Model1Feature::CleanUpAfterSentenceProcessing(const InputType& source)
00262 {
00263 #ifdef WITH_THREADS
00264
00265 boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
00266 #endif
00267
00268 boost::unordered_map<const InputType*, boost::unordered_map<const Factor*, float> >::iterator sentenceCache = m_cache.find(&source);
00269 if (sentenceCache != m_cache.end()) {
00270 sentenceCache->second.clear();
00271 m_cache.erase(sentenceCache);
00272 }
00273 }
00274
00275 }
00276