00001 #include "Model1Feature.h"
00002 #include "moses/StaticData.h"
00003 #include "moses/InputFileStream.h"
00004 #include "moses/ScoreComponentCollection.h"
00005 #include "moses/FactorCollection.h"
00006
00007
00008 using namespace std;
00009
00010 namespace Moses
00011 {
00012
// Surface form GIZA++ uses for the empty (NULL) source word; the
// constructor reserves vocabulary id 0 for it.
const std::string Model1Vocabulary::GIZANULL = "GIZANULL";
00014
00015 Model1Vocabulary::Model1Vocabulary()
00016 {
00017 FactorCollection &factorCollection = FactorCollection::Instance();
00018 m_NULL = factorCollection.AddFactor(GIZANULL,false);
00019 Store(m_NULL,0);
00020 }
00021
00022 bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
00023 {
00024 boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
00025 if ( iter != m_lookup.end() ) {
00026 return false;
00027 }
00028 m_lookup[ word ] = id;
00029 if ( m_vocab.size() <= id ) {
00030 m_vocab.resize(id+1);
00031 }
00032 m_vocab[id] = word;
00033 return true;
00034 }
00035
00036 unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
00037 {
00038 boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
00039
00040 if ( iter != m_lookup.end() ) {
00041 return iter->second;
00042 }
00043
00044 unsigned id = m_vocab.size();
00045 m_vocab.push_back( word );
00046 m_lookup[ word ] = id;
00047 return id;
00048 }
00049
00050 unsigned Model1Vocabulary::GetWordID(const Factor* word) const
00051 {
00052 boost::unordered_map<const Factor*, unsigned>::const_iterator iter = m_lookup.find( word );
00053 if ( iter == m_lookup.end() ) {
00054 return INVALID_ID;
00055 }
00056 return iter->second;
00057 }
00058
00059 const Factor* Model1Vocabulary::GetWord(unsigned id) const
00060 {
00061 if (id >= m_vocab.size()) {
00062 return NULL;
00063 }
00064 return m_vocab[ id ];
00065 }
00066
00067 void Model1Vocabulary::Load(const std::string& fileName)
00068 {
00069 InputFileStream inFile(fileName);
00070 FactorCollection &factorCollection = FactorCollection::Instance();
00071 std::string line;
00072
00073 unsigned i = 0;
00074 if ( getline(inFile, line) ) {
00075 ++i;
00076 std::vector<std::string> tokens = Tokenize(line);
00077 UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
00078 unsigned id = atoll( tokens[0].c_str() );
00079 if (! ( (id == 1) && (tokens[1] == "UNK") )) {
00080 const Factor* factor = factorCollection.AddFactor(tokens[1],false);
00081 bool stored = Store(factor, id);
00082 UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
00083 }
00084 }
00085 while ( getline(inFile, line) ) {
00086 ++i;
00087 std::vector<std::string> tokens = Tokenize(line);
00088 UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
00089 unsigned id = atoll( tokens[0].c_str() );
00090 const Factor* factor = factorCollection.AddFactor(tokens[1],false);
00091 bool stored = Store(factor, id);
00092 UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
00093 }
00094 inFile.Close();
00095 }
00096
00097
00098 void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT)
00099 {
00100 InputFileStream inFile(fileName);
00101 std::string line;
00102
00103 unsigned i = 0;
00104 while ( getline(inFile, line) ) {
00105 ++i;
00106 std::vector<std::string> tokens = Tokenize(line);
00107 UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
00108 unsigned idS = atoll( tokens[0].c_str() );
00109 unsigned idT = atoll( tokens[1].c_str() );
00110 const Factor* wordS = vcbS.GetWord(idS);
00111 const Factor* wordT = vcbT.GetWord(idT);
00112 float prob = std::atof( tokens[2].c_str() );
00113 if ( (wordS != NULL) && (wordT != NULL) ) {
00114 m_ltable[ wordS ][ wordT ] = prob;
00115 }
00116 UTIL_THROW_IF2((wordS == NULL) || (wordT == NULL), "Line " << i << " in " << fileName << " has unknown vocabulary.");
00117 }
00118 inFile.Close();
00119 }
00120
00121
00122 float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* wordT) const
00123 {
00124 float prob = m_floor;
00125
00126 boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > >::const_iterator iter1 = m_ltable.find( wordS );
00127
00128 if ( iter1 != m_ltable.end() ) {
00129 boost::unordered_map< const Factor*, float >::const_iterator iter2 = iter1->second.find( wordT );
00130 if ( iter2 != iter1->second.end() ) {
00131 prob = iter2->second;
00132 if ( prob < m_floor ) {
00133 prob = m_floor;
00134 }
00135 }
00136 }
00137 return prob;
00138 }
00139
00140
00141 Model1Feature::Model1Feature(const std::string &line)
00142 : StatelessFeatureFunction(1, line)
00143 , m_skipTargetPunctuation(false)
00144 , m_is_syntax(false)
00145 {
00146 VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
00147 ReadParameters();
00148 VERBOSE(1, " Done.");
00149 }
00150
00151 void Model1Feature::SetParameter(const std::string& key, const std::string& value)
00152 {
00153 if (key == "path") {
00154 m_fileNameModel1 = value;
00155 } else if (key == "source-vocabulary") {
00156 m_fileNameVcbS = value;
00157 } else if (key == "target-vocabulary") {
00158 m_fileNameVcbT = value;
00159 } else if (key == "skip-target-punctuation") {
00160 m_skipTargetPunctuation = Scan<bool>(value);
00161 } else {
00162 StatelessFeatureFunction::SetParameter(key, value);
00163 }
00164 }
00165
00166 void Model1Feature::Load(AllOptions::ptr const& opts)
00167 {
00168 m_options = opts;
00169 m_is_syntax = is_syntax(opts->search.algo);
00170
00171 FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading source vocabulary from file " << m_fileNameVcbS << " ...");
00172 Model1Vocabulary vcbS;
00173 vcbS.Load(m_fileNameVcbS);
00174 FEATUREVERBOSE2(2, " Done." << std::endl);
00175 FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading target vocabulary from file " << m_fileNameVcbT << " ...");
00176 Model1Vocabulary vcbT;
00177 vcbT.Load(m_fileNameVcbT);
00178 FEATUREVERBOSE2(2, " Done." << std::endl);
00179 FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading model 1 lexical translation table from file " << m_fileNameModel1 << " ...");
00180 m_model1.Load(m_fileNameModel1,vcbS,vcbT);
00181 FEATUREVERBOSE2(2, " Done." << std::endl);
00182 FactorCollection &factorCollection = FactorCollection::Instance();
00183 m_emptyWord = factorCollection.GetFactor(Model1Vocabulary::GIZANULL,false);
00184 UTIL_THROW_IF2(m_emptyWord==NULL, GetScoreProducerDescription()
00185 << ": Factor for GIZA empty word does not exist.");
00186
00187 if (m_skipTargetPunctuation) {
00188 const std::string punctuation = ",;.:!?";
00189 for (size_t i=0; i<punctuation.size(); ++i) {
00190 const std::string punct = punctuation.substr(i,1);
00191 FactorCollection &factorCollection = FactorCollection::Instance();
00192 const Factor* punctFactor = factorCollection.AddFactor(punct,false);
00193 std::pair<std::set<const Factor*>::iterator,bool> inserted = m_punctuation.insert(punctFactor);
00194 }
00195 }
00196 }
00197
00198 void Model1Feature::EvaluateWithSourceContext(const InputType &input
00199 , const InputPath &inputPath
00200 , const TargetPhrase &targetPhrase
00201 , const StackVec *stackVec
00202 , ScoreComponentCollection &scoreBreakdown
00203 , ScoreComponentCollection *estimatedScores) const
00204 {
00205 const Sentence& sentence = static_cast<const Sentence&>(input);
00206 float score = 0.0;
00207 float norm = TransformScore(1+sentence.GetSize());
00208
00209 for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT) {
00210 const Word &wordT = targetPhrase.GetWord(posT);
00211 if (m_skipTargetPunctuation) {
00212 std::set<const Factor*>::const_iterator foundPunctuation = m_punctuation.find(wordT[0]);
00213 if (foundPunctuation != m_punctuation.end()) {
00214 continue;
00215 }
00216 }
00217 if ( !wordT.IsNonTerminal() ) {
00218 float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]);
00219
00220
00221 bool foundInCache = false;
00222 {
00223 #ifdef WITH_THREADS
00224 boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
00225 #endif
00226 boost::unordered_map<const InputType*, boost::unordered_map<const Factor*, float> >::const_iterator sentenceCache = m_cache.find(&input);
00227 if (sentenceCache != m_cache.end()) {
00228 boost::unordered_map<const Factor*, float>::const_iterator cacheHit = sentenceCache->second.find(wordT[0]);
00229 if (cacheHit != sentenceCache->second.end()) {
00230 foundInCache = true;
00231 score += cacheHit->second;
00232 FEATUREVERBOSE(3, "Cached score( " << wordT << " ) = " << cacheHit->second << std::endl);
00233 }
00234 }
00235 }
00236
00237 if (!foundInCache) {
00238 for (size_t posS=(m_is_syntax?1:0); posS<(m_is_syntax?sentence.GetSize()-1:sentence.GetSize()); ++posS) {
00239 const Word &wordS = sentence.GetWord(posS);
00240 float modelProb = m_model1.GetProbability(wordS[0],wordT[0]);
00241 FEATUREVERBOSE(4, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl);
00242 thisWordProb += modelProb;
00243 }
00244 float thisWordScore = TransformScore(thisWordProb) - norm;
00245 FEATUREVERBOSE(3, "score( " << wordT << " ) = " << thisWordScore << std::endl);
00246 {
00247 #ifdef WITH_THREADS
00248
00249 boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
00250 #endif
00251 m_cache[&input][wordT[0]] = thisWordScore;
00252 }
00253 score += thisWordScore;
00254 }
00255 }
00256 }
00257
00258 scoreBreakdown.PlusEquals(this, score);
00259 }
00260
00261 void Model1Feature::CleanUpAfterSentenceProcessing(const InputType& source)
00262 {
00263 #ifdef WITH_THREADS
00264
00265 boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
00266 #endif
00267
00268 boost::unordered_map<const InputType*, boost::unordered_map<const Factor*, float> >::iterator sentenceCache = m_cache.find(&source);
00269 if (sentenceCache != m_cache.end()) {
00270 sentenceCache->second.clear();
00271 m_cache.erase(sentenceCache);
00272 }
00273 }
00274
00275 }
00276