00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #pragma once
00021
00022 #ifndef FEATUREVECTOR_H
00023 #define FEATUREVECTOR_H
00024
00025 #include <iostream>
00026 #include <map>
00027 #include <sstream>
00028 #include <string>
00029 #include <valarray>
00030 #include <vector>
00031
00032 #include <boost/functional/hash.hpp>
00033 #include <boost/unordered_map.hpp>
00034
00035 #ifdef MPI_ENABLE
00036 #include <boost/serialization/access.hpp>
00037 #include <boost/serialization/split_member.hpp>
00038 #include <boost/serialization/string.hpp>
00039 #include <boost/serialization/vector.hpp>
00040 #include <boost/serialization/valarray.hpp>
00041 #endif
00042
00043 #ifdef WITH_THREADS
00044 #include <boost/thread/shared_mutex.hpp>
00045 #endif
00046
00047 #include "util/exception.hh"
00048 #include "util/string_piece.hh"
00049
00050 namespace Moses
00051 {
00052
/** Scalar type used for every feature value stored in this header. */
typedef float FValue;
00054
00058 struct FName {
00059
00060 static const std::string SEP;
00061
00062 typedef boost::unordered_map<std::string,size_t> Name2Id;
00063 typedef boost::unordered_map<size_t,size_t> Id2Count;
00064
00065 static Name2Id name2id;
00066 static std::vector<std::string> id2name;
00067 static Id2Count id2hopeCount;
00068 static Id2Count id2fearCount;
00069
00070
00071
00072
00073 FName(const StringPiece &root, const StringPiece &name) {
00074 std::string assembled(root.data(), root.size());
00075 assembled += SEP;
00076 assembled.append(name.data(), name.size());
00077 init(assembled);
00078 }
00079 explicit FName(const StringPiece &name) {
00080 init(name);
00081 }
00082
00083 const std::string& name() const;
00084
00085
00086 size_t hash() const;
00087
00088 bool operator==(const FName& rhs) const ;
00089 bool operator!=(const FName& rhs) const ;
00090
00091 static size_t getId(const std::string& name);
00092 static size_t getHopeIdCount(const std::string& name);
00093 static size_t getFearIdCount(const std::string& name);
00094 static void incrementHopeId(const std::string& name);
00095 static void incrementFearId(const std::string& name);
00096 static void eraseId(size_t id);
00097
00098 private:
00099 void init(const StringPiece& name);
00100 size_t m_id;
00101 #ifdef WITH_THREADS
00102
00103 static boost::shared_mutex m_idLock;
00104 #endif
00105 };
00106
00107 std::ostream& operator<<(std::ostream& out,const FName& name);
00108
00109 struct FNameEquals {
00110 inline bool operator() (const FName& lhs, const FName& rhs) const {
00111 return (lhs == rhs);
00112 }
00113 };
00114
00115 struct FNameHash
00116 : std::unary_function<FName, std::size_t> {
00117 std::size_t operator()(const FName& x) const {
00118 return x.hash();
00119 }
00120 };
00121
00122 class ProxyFVector;
00123
00127 class FVector
00128 {
00129 public:
00131 FVector(size_t coreFeatures = 0);
00132
00133 FVector& operator=( const FVector& rhs ) {
00134 m_features = rhs.m_features;
00135 m_coreFeatures = rhs.m_coreFeatures;
00136 return *this;
00137 }
00138
00139
00140
00141
00142 void resize(size_t newsize);
00143
00144 typedef boost::unordered_map<FName,FValue,FNameHash, FNameEquals> FNVmap;
00146 typedef FNVmap::iterator iterator;
00147 typedef FNVmap::const_iterator const_iterator;
00148 iterator begin() {
00149 return m_features.begin();
00150 }
00151 iterator end() {
00152 return m_features.end();
00153 }
00154 const_iterator cbegin() const {
00155 return m_features.cbegin();
00156 }
00157 const_iterator cend() const {
00158 return m_features.cend();
00159 }
00160
00161 bool hasNonDefaultValue(FName name) const {
00162 return m_features.find(name) != m_features.end();
00163 }
00164 void clear();
00165
00166
00168 bool load(const std::string& filename);
00169 void save(const std::string& filename) const;
00170 void write(std::ostream& out, const std::string& sep=" ", const std::string& linesep="\n") const ;
00171
00173 ProxyFVector operator[](const FName& name);
00174 FValue& operator[](size_t index);
00175 FValue operator[](const FName& name) const;
00176 FValue operator[](size_t index) const;
00177
00179 size_t size() const {
00180 return m_features.size() + m_coreFeatures.size();
00181 }
00182
00183 size_t coreSize() const {
00184 return m_coreFeatures.size();
00185 }
00186
00187 const std::valarray<FValue> &getCoreFeatures() const {
00188 return m_coreFeatures;
00189 }
00190
00192 bool operator== (const FVector& rhs) const;
00193 bool operator!= (const FVector& rhs) const;
00194
00195 FValue inner_product(const FVector& rhs) const;
00196
00197 friend class ProxyFVector;
00198
00200
00201
00202 FVector& operator+= (const FVector& rhs);
00203 FVector& operator-= (const FVector& rhs);
00204 FVector& operator*= (const FVector& rhs);
00205 FVector& operator/= (const FVector& rhs);
00206
00207 FVector& operator*= (const FValue& rhs);
00208 FVector& operator/= (const FValue& rhs);
00209
00210 FVector& multiplyEqualsBackoff(const FVector& rhs, float backoff);
00211 FVector& multiplyEquals(float core_r0, float sparse_r0);
00212
00213 FVector& max_equals(const FVector& rhs);
00214
00216 FValue l1norm() const;
00217 FValue l1norm_coreFeatures() const;
00218 FValue l2norm() const;
00219 FValue linfnorm() const;
00220 size_t l1regularize(float lambda);
00221 void l2regularize(float lambda);
00222 size_t sparseL1regularize(float lambda);
00223 void sparseL2regularize(float lambda);
00224 FValue sum() const;
00225
00227 std::ostream& print(std::ostream& out) const;
00228
00230 void printCoreFeatures();
00231
00232 void thresholdScale(float maxValue );
00233
00234 void capMax(FValue maxValue);
00235 void capMin(FValue minValue);
00236
00237 void sparsePlusEquals(const FVector& rhs);
00238 void corePlusEquals(const FVector& rhs);
00239 void coreAssign(const FVector& rhs);
00240
00241 void incrementSparseHopeFeatures();
00242 void incrementSparseFearFeatures();
00243 void printSparseHopeFeatureCounts(std::ofstream& out);
00244 void printSparseFearFeatureCounts(std::ofstream& out);
00245 void printSparseHopeFeatureCounts();
00246 void printSparseFearFeatureCounts();
00247 size_t pruneSparseFeatures(size_t threshold);
00248 size_t pruneZeroWeightFeatures();
00249 void updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts);
00250 void updateLearningRates(float decay_core, float decay_sparse, const FVector& confidence_counts, float core_r0, float sparse_r0);
00251
00252
00253 void setToBinaryOf(const FVector& rhs);
00254
00255
00256 FVector& coreDivideEquals(float scalar);
00257
00258
00259 FVector& divideEquals(const FVector& rhs);
00260
00261 void merge(const FVector &other);
00262
00263 #ifdef MPI_ENABLE
00264 friend class boost::serialization::access;
00265 #endif
00266
00267 private:
00268 friend void swap(FVector &first, FVector &second);
00269
00271 const FValue& get(const FName& name) const;
00272 FValue getBackoff(const FName& name, float backoff) const;
00273 void set(const FName& name, const FValue& value);
00274
00275 FNVmap m_features;
00276 std::valarray<FValue> m_coreFeatures;
00277
00278 #ifdef MPI_ENABLE
00279
00280 template<class Archive>
00281 void save(Archive &ar, const unsigned int version) const {
00282 std::vector<std::string> names;
00283 std::vector<FValue> values;
00284 for (const_iterator i = cbegin(); i != cend(); ++i) {
00285 std::ostringstream ostr;
00286 ostr << i->first;
00287 names.push_back(ostr.str());
00288 values.push_back(i->second);
00289 }
00290 ar << names;
00291 ar << values;
00292 ar << m_coreFeatures;
00293 }
00294
00295 template<class Archive>
00296 void load(Archive &ar, const unsigned int version) {
00297 clear();
00298 std::vector<std::string> names;
00299 std::vector<FValue> values;
00300 ar >> names;
00301 ar >> values;
00302 ar >> m_coreFeatures;
00303 UTIL_THROW_IF2(names.size() != values.size(), "Error");
00304 for (size_t i = 0; i < names.size(); ++i) {
00305 set(FName(names[i]), values[i]);
00306 }
00307 }
00308
00309 BOOST_SERIALIZATION_SPLIT_MEMBER()
00310
00311 #endif
00312
00313 };
00314
00315 inline void swap(FVector &first, FVector &second)
00316 {
00317 swap(first.m_features, second.m_features);
00318 swap(first.m_coreFeatures, second.m_coreFeatures);
00319 }
00320
00321 std::ostream& operator<<( std::ostream& out, const FVector& fv);
00322
00323 const FVector operator+(const FVector& lhs, const FVector& rhs);
00324 const FVector operator-(const FVector& lhs, const FVector& rhs);
00325 const FVector operator*(const FVector& lhs, const FVector& rhs);
00326 const FVector operator/(const FVector& lhs, const FVector& rhs);
00327
00328
00329 const FVector operator*(const FVector& lhs, const FValue& rhs);
00330 const FVector operator/(const FVector& lhs, const FValue& rhs);
00331
00332 const FVector fvmax(const FVector& lhs, const FVector& rhs);
00333
00334 FValue inner_product(const FVector& lhs, const FVector& rhs);
00335
00336 struct FVectorPlus {
00337 FVector operator()(const FVector& lhs, const FVector& rhs) const {
00338 return lhs + rhs;
00339 }
00340 };
00341
00346 class ProxyFVector
00347 {
00348 public:
00349 ProxyFVector(FVector *fv, const FName& name ) : m_fv(fv), m_name(name) {}
00350 ProxyFVector &operator=(const FValue& value) {
00351
00352
00353
00354 m_fv->set(m_name,value);
00355 return *this;
00356
00357 }
00358
00359 operator FValue() {
00360
00361
00362 return m_fv->get(m_name);
00363 }
00364
00365
00366
00367
00368
00369 FValue operator++() {
00370 return ++m_fv->m_features[m_name];
00371 }
00372
00373 FValue operator +=(FValue lhs) {
00374 return (m_fv->m_features[m_name] += lhs);
00375 }
00376
00377 FValue operator -=(FValue lhs) {
00378 return (m_fv->m_features[m_name] -= lhs);
00379 }
00380
00381 private:
00382 FVector* m_fv;
00383 const FName& m_name;
00384
00385 };
00386
00387 }
00388
00389 #endif