Moses: /disk4/html/www/moses/doxygen/mosesdecoder/moses/ScoreComponentCollection.h Source File

00001 // -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
00002 /***********************************************************************
00003 Moses - factored phrase-based language decoder
00004 Copyright (C) 2006 University of Edinburgh
00005 
00006 This library is free software; you can redistribute it and/or
00007 modify it under the terms of the GNU Lesser General Public
00008 License as published by the Free Software Foundation; either
00009 version 2.1 of the License, or (at your option) any later version.
00010 
00011 This library is distributed in the hope that it will be useful,
00012 but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014 Lesser General Public License for more details.
00015 
00016 You should have received a copy of the GNU Lesser General Public
00017 License along with this library; if not, write to the Free Software
00018 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00019 ***********************************************************************/
00020 
00021 #ifndef moses_ScoreComponentCollection_h
00022 #define moses_ScoreComponentCollection_h
00023 
00024 #include <numeric>
00025 #include <sstream>
00026 
00027 #ifdef MPI_ENABLE
00028 #include <boost/serialization/access.hpp>
00029 #include <boost/serialization/split_member.hpp>
00030 #endif
00031 
00032 #include "moses/FF/FeatureFunction.h"
00033 #include "FeatureVector.h"
00034 #include "TypeDef.h"
00035 #include "Util.h"
00036 #include "util/exception.hh"
00037 
00038 namespace Moses
00039 {
00040 
00044 struct ScorePair {
00045   friend std::ostream& operator<<(std::ostream& os, const ScorePair& rhs);
00046 
00047   std::vector<float> denseScores;
00048   std::map<StringPiece, float> sparseScores;
00049 
00050   ScorePair() {
00051   }
00052   ScorePair(const std::vector<float> &other)
00053     :denseScores(other) {
00054   }
00055 
00056   void PlusEquals(const ScorePair &other);
00057   void PlusEquals(const StringPiece &key, float value);
00058 
00059   void PlusEquals(const std::vector<float> &other) {
00060     UTIL_THROW_IF2(denseScores.size() != other.size(), "Number of scores incorrect");
00061     std::transform(denseScores.begin(),
00062                    denseScores.end(),
00063                    other.begin(),
00064                    denseScores.begin(),
00065                    std::plus<float>());
00066   }
00067 };
00068 
00069 /*** An unweighted collection of scores for a translation or step in a translation.
00070  *
00071  * In the factored phrase-based models that are implemented by moses, there are a set of
00072  * scores that come from a variety of sources (translation probabilities, language model
00073  * probablilities, distortion probabilities, generation probabilities).  Furthermore, while
00074  * some of these scores may be 0, this number is fixed (and generally quite small, ie, less
00075  * than 15), for a given model.
00076  *
00077  * The values contained in ScoreComponentCollection objects are unweighted scores (log-probs).
00078  *
00079  * ScoreComponentCollection objects can be added and subtracted, which makes them appropriate
00080  * to be the datatype used to return the result of a score computations (in this case they will
00081  * have most values set to zero, except for the ones that are results of the indivudal computation
00082  * this will then be added into the "running total" in the Hypothesis.  In fact, for a score
00083  * to be tracked in the hypothesis (and thus to participate in the decoding process), a class
00084  * representing that score must extend the ScoreProducer abstract base class.  For an example
00085  * refer to the DistortionScoreProducer class.
00086  */
00087 class ScoreComponentCollection
00088 {
00089   friend std::ostream& operator<<(std::ostream& os, const ScoreComponentCollection& rhs);
00090   friend void swap(ScoreComponentCollection &first, ScoreComponentCollection &second);
00091 
00092 private:
00093   FVector m_scores;
00094 
00095 public:
00096   // typedef std::pair<size_t,size_t> IndexPair;
00097 private:
00098   // typedef std::map<const FeatureFunction*,IndexPair> ScoreIndexMap;
00099   // static  ScoreIndexMap s_scoreIndexes;
00100   static size_t s_denseVectorSize;
00101 public:
00102   // static IndexPair GetIndexes(const FeatureFunction* sp) {
00103   //   ScoreIndexMap::const_iterator indexIter = s_scoreIndexes.find(sp);
00104   //   if (indexIter == s_scoreIndexes.end()) {
00105   //     std::stringstream strme;
00106   //     strme << "ERROR: FeatureFunction: " << sp->GetScoreProducerDescription() <<
00107   //           " not registered with ScoreIndexMap" << std::endl;
00108   //     strme << "You must call ScoreComponentCollection.RegisterScoreProducer() " <<
00109   //           " for every FeatureFunction" << std::endl;
00110   //     UTIL_THROW2(strme.str());
00111   //   }
00112   //   return indexIter->second;
00113   // }
00114 
00115 public:
00116   static void ResetCounter() {
00117     s_denseVectorSize = 0;
00118   }
00119 
00121   ScoreComponentCollection();
00122 
00124   ScoreComponentCollection(const ScoreComponentCollection& rhs)
00125     : m_scores(rhs.m_scores) {
00126   }
00127 
00128   ScoreComponentCollection& operator=( const ScoreComponentCollection& rhs ) {
00129     m_scores = rhs.m_scores;
00130     return *this;
00131   }
00132 
00137   static void RegisterScoreProducer(FeatureFunction* scoreProducer);
00138 
00140   bool Load(const std::string& filename) {
00141     return m_scores.load(filename);
00142   }
00143 
00144   const FVector& GetScoresVector() const {
00145     return m_scores;
00146   }
00147 
00148   const std::valarray<FValue> &getCoreFeatures() const {
00149     return m_scores.getCoreFeatures();
00150   }
00151 
00152   size_t Size() const {
00153     return m_scores.size();
00154   }
00155 
00156   void Resize() {
00157     if (m_scores.coreSize() != s_denseVectorSize) {
00158       m_scores.resize(s_denseVectorSize);
00159     }
00160   }
00161 
00163   static FVector CreateFVector() {
00164     return FVector(s_denseVectorSize);
00165   }
00166 
00167   void SetToBinaryOf(const ScoreComponentCollection& rhs) {
00168     m_scores.setToBinaryOf(rhs.m_scores);
00169   }
00170 
00172   void ZeroAll() {
00173     m_scores.clear();
00174   }
00175 
00176   void MultiplyEquals(float scalar);
00177   void DivideEquals(float scalar);
00178   void CoreDivideEquals(float scalar);
00179   void DivideEquals(const ScoreComponentCollection& rhs);
00180   void MultiplyEquals(const ScoreComponentCollection& rhs);
00181   void MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff);
00182   void MultiplyEquals(float core_r0, float sparse_r0);
00183   void MultiplyEquals(const FeatureFunction* sp, float scalar);
00184 
00185   size_t GetNumberWeights(const FeatureFunction* sp);
00186 
00187   void CoreAssign(const ScoreComponentCollection& rhs) {
00188     m_scores.coreAssign(rhs.m_scores);
00189   }
00190 
00192   void PlusEquals(const ScoreComponentCollection& rhs) {
00193     m_scores += rhs.m_scores;
00194   }
00195 
00196   // add only sparse features
00197   void SparsePlusEquals(const ScoreComponentCollection& rhs) {
00198     m_scores.sparsePlusEquals(rhs.m_scores);
00199   }
00200 
00201   // add only core features
00202   void CorePlusEquals(const ScoreComponentCollection& rhs) {
00203     m_scores.corePlusEquals(rhs.m_scores);
00204   }
00205 
00206   void PlusEquals(const FVector& scores) {
00207     m_scores += scores;
00208   }
00209 
00211   void MinusEquals(const ScoreComponentCollection& rhs) {
00212     m_scores -= rhs.m_scores;
00213   }
00214 
00215   //For features which have an unbounded number of components
00216   void MinusEquals(const FeatureFunction*sp, const std::string& name, float score) {
00217     FName fname(sp->GetScoreProducerDescription(),name);
00218     m_scores[fname] -= score;
00219   }
00220 
00221   //For features which have an unbounded number of components
00222   void SparseMinusEquals(const std::string& full_name, float score) {
00223     FName fname(full_name);
00224     m_scores[fname] -= score;
00225   }
00226 
00230   void
00231   PlusEquals(const FeatureFunction* sp,
00232              const ScoreComponentCollection& scores) {
00233     size_t i = sp->GetIndex();
00234     size_t stop = i + sp->GetNumScoreComponents();
00235     for (; i < stop; ++i) m_scores[i] += scores.m_scores[i];
00236   }
00237 
00241   void PlusEquals(const FeatureFunction* sp, const std::vector<float>& scores) {
00242     UTIL_THROW_IF2(scores.size() != sp->GetNumScoreComponents(),
00243                    "Number of scores is incorrect");
00244     size_t offset = sp->GetIndex();
00245     for (size_t i = 0; i < scores.size(); ++i) {
00246       m_scores[i + offset] += scores[i];
00247     }
00248   }
00249 
00250   void PlusEquals(const FeatureFunction* sp, float scores[]) {
00251     size_t numScores = sp->GetNumScoreComponents();
00252     size_t offset = sp->GetIndex();
00253     for (size_t i = 0; i < numScores; ++i) {
00254       m_scores[i + offset] += scores[i];
00255     }
00256   }
00257 
00261   void PlusEquals(const FeatureFunction* sp, float score) {
00262     UTIL_THROW_IF2(sp->GetNumScoreComponents() != 1,
00263                    "Number of scores is incorrect");
00264     m_scores[sp->GetIndex()] += score;
00265   }
00266 
00267   //For features which have an unbounded number of components
00268   void PlusEquals(const FeatureFunction*sp, const StringPiece& name, float score) {
00269     FName fname(sp->GetScoreProducerDescription(),name);
00270     m_scores[fname] += score;
00271   }
00272 
00273   void PlusEquals(const FeatureFunction* sp, const ScorePair &scorePair);
00274 
00275   // Add score by index
00276   void PlusEquals(size_t index, float score) {
00277     m_scores[index] += score;
00278   }
00279 
00280   //For features which have an unbounded number of components
00281   void SparsePlusEquals(const std::string& full_name, float score) {
00282     FName fname(full_name);
00283     m_scores[fname] += score;
00284   }
00285 
00286   void SparsePlusEquals(const FName& fname, float score) {
00287     m_scores[fname] += score;
00288   }
00289 
00290   void Assign(const FeatureFunction* sp, const std::vector<float>& scores);
00291 
00295   void Assign(const FeatureFunction* sp, float score) {
00296 
00297     UTIL_THROW_IF2(sp->GetNumScoreComponents() != 1,
00298                    "Feature function must must only contain 1 score");
00299     m_scores[sp->GetIndex()] = score;
00300   }
00301 
00302   // Assign score by index
00303   void Assign(size_t index, float score) {
00304     m_scores[index] = score;
00305   }
00306 
00307   void Assign(const FeatureFunction*sp, const StringPiece &name, float score) {
00308     FName fname(sp->GetScoreProducerDescription(),name);
00309     m_scores[fname] = score;
00310   }
00311 
00312 
00313   //Read sparse features from string
00314   void Assign(const FeatureFunction* sp, const std::string &line);
00315 
00316   // shortcut: setting the value directly using the feature name
00317   void Assign(const std::string name, float score) {
00318     FName fname(name);
00319     m_scores[fname] = score;
00320   }
00321 
00322   float InnerProduct(const ScoreComponentCollection& rhs) const {
00323     return m_scores.inner_product(rhs.m_scores);
00324   }
00325 
00326   float PartialInnerProduct(const FeatureFunction* sp, const std::vector<float>& rhs) const {
00327     std::vector<float> lhs = GetScoresForProducer(sp);
00328     UTIL_THROW_IF2(lhs.size() != rhs.size(),
00329                    "Number of weights must match number of scores");
00330     return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0f);
00331   }
00332 
00334   std::vector<float> GetScoresForProducer(const FeatureFunction* sp) const {
00335     size_t components = sp->GetNumScoreComponents();
00336 
00337     std::vector<float> res(components);
00338     size_t offset = sp->GetIndex();
00339     for (size_t i = 0; i < res.size(); ++i) {
00340       res[i] = m_scores[i + offset];
00341     }
00342     return res;
00343   }
00344 
00346   FVector GetVectorForProducer(const FeatureFunction* sp) const;
00347 
00348   float GetSparseWeight(const FName& featureName) const {
00349     return m_scores[featureName];
00350   }
00351 
00352   void PrintCoreFeatures() {
00353     m_scores.printCoreFeatures();
00354   }
00355 
00356   void ThresholdScaling(float maxValue) {
00357     // find (smallest) factor for which all weights are <= maxValue
00358     // 0.1 / 0.14 = 0.714285714
00359     // 0.1 / 0.17 = 0.588235294
00360     m_scores.thresholdScale(maxValue);
00361   }
00362 
00363   void CapMax(float maxValue) {
00364     // cap all sparse features to maxValue
00365     m_scores.capMax(maxValue);
00366   }
00367 
00368   void CapMin(float minValue) {
00369     // cap all sparse features to minValue
00370     m_scores.capMin(minValue);
00371   }
00372 
00373   // std::pair<size_t,size_t> GetIndexesForProducer(const FeatureFunction* sp) const {
00374   //   IndexPair indexPair = GetIndexes(sp);
00375   //   return indexPair;
00376   // }
00377 
00380   float GetScoreForProducer(const FeatureFunction* sp) const {
00381     UTIL_THROW_IF2(sp->GetNumScoreComponents() != 1,
00382                    "Feature function must must only contain 1 score");
00383     return m_scores[sp->GetIndex()];
00384   }
00385 
00386   //For features which have an unbounded number of components
00387   float GetScoreForProducer
00388   (const FeatureFunction* sp, const std::string& name) const {
00389     FName fname(sp->GetScoreProducerDescription(),name);
00390     return m_scores[fname];
00391   }
00392 
00393   float GetWeightedScore() const;
00394 
00395   void ZeroDenseFeatures(const FeatureFunction* sp);
00396   void InvertDenseFeatures(const FeatureFunction* sp);
00397   void L1Normalise();
00398   float GetL1Norm() const;
00399   float GetL2Norm() const;
00400   float GetLInfNorm() const;
00401   size_t L1Regularize(float lambda);
00402   void L2Regularize(float lambda);
00403   size_t SparseL1Regularize(float lambda);
00404   void SparseL2Regularize(float lambda);
00405   void Save(const std::string& filename) const;
00406   void Save(std::ostream&, bool multiline=true) const;
00407 
00408   void IncrementSparseHopeFeatures() {
00409     m_scores.incrementSparseHopeFeatures();
00410   }
00411   void IncrementSparseFearFeatures() {
00412     m_scores.incrementSparseFearFeatures();
00413   }
00414   void PrintSparseHopeFeatureCounts(std::ofstream& out) {
00415     m_scores.printSparseHopeFeatureCounts(out);
00416   }
00417   void PrintSparseFearFeatureCounts(std::ofstream& out) {
00418     m_scores.printSparseFearFeatureCounts(out);
00419   }
00420   void PrintSparseHopeFeatureCounts() {
00421     m_scores.printSparseHopeFeatureCounts();
00422   }
00423   void PrintSparseFearFeatureCounts() {
00424     m_scores.printSparseFearFeatureCounts();
00425   }
00426   size_t PruneSparseFeatures(size_t threshold) {
00427     return m_scores.pruneSparseFeatures(threshold);
00428   }
00429   size_t PruneZeroWeightFeatures() {
00430     return m_scores.pruneZeroWeightFeatures();
00431   }
00432   void UpdateConfidenceCounts(ScoreComponentCollection &weightUpdate, bool signedCounts) {
00433     m_scores.updateConfidenceCounts(weightUpdate.m_scores, signedCounts);
00434   }
00435   void UpdateLearningRates(float decay_core, float decay_sparse, ScoreComponentCollection &confidenceCounts, float core_r0, float sparse_r0) {
00436     m_scores.updateLearningRates(decay_core, decay_sparse, confidenceCounts.m_scores, core_r0, sparse_r0);
00437   }
00438   void Merge(const ScoreComponentCollection &other) {
00439     m_scores.merge(other.m_scores);
00440   }
00441 
00442   void OutputAllFeatureScores(std::ostream &out, bool with_labels) const;
00443   void OutputFeatureScores(std::ostream& out, Moses::FeatureFunction const* ff,
00444                            std::string &lastName, bool with_labels) const;
00445 
00446 #ifdef MPI_ENABLE
00447 public:
00448   friend class boost::serialization::access;
00449 
00450 private:
00451   //serialization
00452   template<class Archive>
00453   void save(Archive &ar, const unsigned int version) const {
00454     ar << m_scores;
00455   }
00456 
00457   template<class Archive>
00458   void load(Archive &ar, const unsigned int version) {
00459     ar >> m_scores;
00460 
00461   }
00462 
00463   BOOST_SERIALIZATION_SPLIT_MEMBER()
00464 
00465 #endif
00466 
00467 };
00468 
00469 struct SCCPlus {
00470   ScoreComponentCollection operator()
00471   (const ScoreComponentCollection& lhs,
00472    const ScoreComponentCollection& rhs) {
00473     ScoreComponentCollection sum(lhs);
00474     sum.PlusEquals(rhs);
00475     return sum;
00476   }
00477 };
00478 
00479 inline void swap(ScoreComponentCollection &first, ScoreComponentCollection &second)
00480 {
00481   swap(first.m_scores, second.m_scores);
00482 }
00483 
00484 }
00485 #endif