00001
00002
00003
00004
00005
00006
00007
00008
00009 #include "FeatureStats.h"
00010
00011 #include <fstream>
00012 #include <cmath>
00013 #include <stdexcept>
00014
00015 #include <boost/functional/hash.hpp>
00016
00017 #include "util/murmur_hash.hh"
00018
00019 #include "Util.h"
00020
00021 using namespace std;
00022
namespace
{
// Number of dense feature slots that a default-constructed FeatureStats
// allocates up front.
const int kAvailableSize = 8;
} // namespace
00027
00028 namespace MosesTuning
00029 {
00030
00031
00032 SparseVector::name2id_t SparseVector::m_name_to_id;
00033 SparseVector::id2name_t SparseVector::m_id_to_name;
00034
00035 FeatureStatsType SparseVector::get(const string& name) const
00036 {
00037 name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
00038 if (name2id_iter == m_name_to_id.end()) return 0;
00039 size_t id = name2id_iter->second;
00040 return get(id);
00041 }
00042
00043 FeatureStatsType SparseVector::get(size_t id) const
00044 {
00045 fvector_t::const_iterator fvector_iter = m_fvector.find(id);
00046 if (fvector_iter == m_fvector.end()) return 0;
00047 return fvector_iter->second;
00048 }
00049
00050 void SparseVector::set(const string& name, FeatureStatsType value)
00051 {
00052 name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
00053 size_t id = 0;
00054 if (name2id_iter == m_name_to_id.end()) {
00055 id = m_id_to_name.size();
00056 m_id_to_name.push_back(name);
00057 m_name_to_id[name] = id;
00058 } else {
00059 id = name2id_iter->second;
00060 }
00061 m_fvector[id] = value;
00062 }
00063
00064 void SparseVector::set(size_t id, FeatureStatsType value)
00065 {
00066 assert(m_id_to_name.size() > id);
00067 m_fvector[id] = value;
00068 }
00069
00070 void SparseVector::write(ostream& out, const string& sep) const
00071 {
00072 for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
00073 if (abs(i->second) < 0.00001) continue;
00074 string name = m_id_to_name[i->first];
00075 out << name << sep << i->second << " ";
00076 }
00077 }
00078
00079 void SparseVector::clear()
00080 {
00081 m_fvector.clear();
00082 }
00083
00084 void SparseVector::load(const string& file)
00085 {
00086 ifstream in(file.c_str());
00087 if (!in) {
00088 throw runtime_error("Failed to open sparse weights file: " + file);
00089 }
00090 string line;
00091 while(getline(in,line)) {
00092 if (line[0] == '#') continue;
00093 istringstream linestream(line);
00094 string name;
00095 float value;
00096 linestream >> name;
00097 linestream >> value;
00098 set(name,value);
00099 }
00100 }
00101
00102 SparseVector& SparseVector::operator+=(const SparseVector& rhs)
00103 {
00104
00105 for (fvector_t::const_iterator i = rhs.m_fvector.begin();
00106 i != rhs.m_fvector.end(); ++i) {
00107 m_fvector[i->first] = get(i->first) + (i->second);
00108 }
00109 return *this;
00110 }
00111
00112 SparseVector& SparseVector::operator-=(const SparseVector& rhs)
00113 {
00114
00115 for (fvector_t::const_iterator i = rhs.m_fvector.begin();
00116 i != rhs.m_fvector.end(); ++i) {
00117 m_fvector[i->first] = get(i->first) - (i->second);
00118 }
00119 return *this;
00120 }
00121
00122 FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const
00123 {
00124 FeatureStatsType product = 0.0;
00125 for (fvector_t::const_iterator i = m_fvector.begin();
00126 i != m_fvector.end(); ++i) {
00127 product += ((i->second) * (rhs.get(i->first)));
00128 }
00129 return product;
00130 }
00131
00132 SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs)
00133 {
00134 SparseVector res(lhs);
00135 res -= rhs;
00136 return res;
00137 }
00138
00139 FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs)
00140 {
00141 if (lhs.size() >= rhs.size()) {
00142 return rhs.inner_product(lhs);
00143 } else {
00144 return lhs.inner_product(rhs);
00145 }
00146 }
00147
00148 std::vector<std::size_t> SparseVector::feats() const
00149 {
00150 std::vector<std::size_t> toRet;
00151 for(fvector_t::const_iterator iter = m_fvector.begin();
00152 iter!=m_fvector.end();
00153 iter++) {
00154 toRet.push_back(iter->first);
00155 }
00156 return toRet;
00157 }
00158
00159 std::size_t SparseVector::encode(const std::string& name)
00160 {
00161 name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
00162 size_t id = 0;
00163 if (name2id_iter == m_name_to_id.end()) {
00164 id = m_id_to_name.size();
00165 m_id_to_name.push_back(name);
00166 m_name_to_id[name] = id;
00167 } else {
00168 id = name2id_iter->second;
00169 }
00170 return id;
00171 }
00172
00173 std::string SparseVector::decode(std::size_t id)
00174 {
00175 return m_id_to_name[id];
00176 }
00177
00178 bool operator==(SparseVector const& item1, SparseVector const& item2)
00179 {
00180 return item1.m_fvector==item2.m_fvector;
00181 }
00182
00183
00184 std::size_t hash_value(SparseVector const& item)
00185 {
00186 size_t seed = 0;
00187 for (SparseVector::fvector_t::const_iterator i = item.m_fvector.begin(); i != item.m_fvector.end(); ++i) {
00188 seed = util::MurmurHashNative(&(i->first), sizeof(i->first), seed);
00189 seed = util::MurmurHashNative(&(i->second), sizeof(i->second), seed);
00190 }
00191 return seed;
00192 }
00193
00194
00195 FeatureStats::FeatureStats()
00196 : m_available_size(kAvailableSize), m_entries(0),
00197 m_array(new FeatureStatsType[m_available_size]) {}
00198
00199 FeatureStats::FeatureStats(const size_t size)
00200 : m_available_size(size), m_entries(size),
00201 m_array(new FeatureStatsType[m_available_size])
00202 {
00203 memset(m_array, 0, GetArraySizeWithBytes());
00204 }
00205
00206 FeatureStats::~FeatureStats()
00207 {
00208 delete [] m_array;
00209 }
00210
00211 void FeatureStats::Copy(const FeatureStats &stats)
00212 {
00213 m_available_size = stats.available();
00214 m_entries = stats.size();
00215 m_array = new FeatureStatsType[m_available_size];
00216 memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
00217 m_map = stats.getSparse();
00218 }
00219
00220 FeatureStats::FeatureStats(const FeatureStats &stats)
00221 {
00222 Copy(stats);
00223 }
00224
00225 FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
00226 {
00227 delete [] m_array;
00228 Copy(stats);
00229 return *this;
00230 }
00231
00232 void FeatureStats::expand()
00233 {
00234 m_available_size *= 2;
00235 featstats_t t_ = new FeatureStatsType[m_available_size];
00236 memcpy(t_, m_array, GetArraySizeWithBytes());
00237 delete [] m_array;
00238 m_array = t_;
00239 }
00240
00241 void FeatureStats::add(FeatureStatsType v)
00242 {
00243 if (isfull()) expand();
00244 m_array[m_entries++]=v;
00245 }
00246
00247 void FeatureStats::addSparse(const string& name, FeatureStatsType v)
00248 {
00249 m_map.set(name,v);
00250 }
00251
00252 void FeatureStats::set(string &theString, const SparseVector& sparseWeights )
00253 {
00254 string substring, stringBuf;
00255 reset();
00256
00257 while (!theString.empty()) {
00258 getNextPound(theString, substring);
00259
00260 if (substring.find("=") == string::npos) {
00261 add(ConvertStringToFeatureStatsType(substring));
00262 }
00263
00264 else {
00265 size_t separator = substring.find_last_of("=");
00266 addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) );
00267 }
00268 }
00269
00270 if (sparseWeights.size()) {
00271
00272 FeatureStatsType merged = inner_product(sparseWeights, m_map);
00273 add(merged);
00274
00275
00276
00277
00278
00279
00280
00281 m_map.clear();
00282 }
00283
00284
00285
00286
00287
00288
00289 }
00290
00291 void FeatureStats::loadbin(istream* is)
00292 {
00293 is->read(reinterpret_cast<char*>(m_array),
00294 static_cast<streamsize>(GetArraySizeWithBytes()));
00295 }
00296
00297 void FeatureStats::loadtxt(istream* is, const SparseVector& sparseWeights)
00298 {
00299 string line;
00300 getline(*is, line);
00301 set(line, sparseWeights);
00302 }
00303
00304 void FeatureStats::savetxt(const string &file)
00305 {
00306 ofstream ofs(file.c_str(), ios::out);
00307 ostream* os = &ofs;
00308 savetxt(os);
00309 }
00310
00311 void FeatureStats::savetxt(ostream* os)
00312 {
00313 *os << *this;
00314 }
00315
00316 void FeatureStats::savetxt()
00317 {
00318 savetxt(&cout);
00319 }
00320
00321 void FeatureStats::savebin(ostream* os)
00322 {
00323 os->write(reinterpret_cast<char*>(m_array),
00324 static_cast<streamsize>(GetArraySizeWithBytes()));
00325 }
00326
00327 ostream& operator<<(ostream& o, const FeatureStats& e)
00328 {
00329
00330 for (size_t i=0; i< e.size(); i++) {
00331 o << e.get(i) << " ";
00332 }
00333
00334 e.getSparse().write(o,"");
00335
00336 return o;
00337 }
00338
00339 bool operator==(const FeatureStats& f1, const FeatureStats& f2)
00340 {
00341 size_t size = f1.size();
00342
00343 if (size != f2.size())
00344 return false;
00345
00346 for (size_t k=0; k < size; k++) {
00347 if (f1.get(k) != f2.get(k))
00348 return false;
00349 }
00350
00351 return true;
00352 }
00353
00354 }