00001
00002 #include "ug_sampling_bias.h"
00003 #include <iostream>
00004 #include <boost/foreach.hpp>
00005 #include "moses/Util.h"
00006 #ifndef NO_MOSES
00007 #include "moses/Timer.h"
00008 #endif
00009
00010
00011 #include "ug_http_client.h"
00012
00013
00014 namespace sapt
00015 {
00016 using tpt::id_type;
00017
00018 std::string
00019 query_bias_server(std::string const& server,
00020 std::string const& context,
00021 std::ostream* log)
00022 {
00023 std::string query = server + Moses::uri_encode(context);
00024 boost::asio::io_service io_service;
00025 Moses::http_client c(io_service, query, log);
00026 io_service.run();
00027
00028 if (log)
00029 {
00030 std::string response = c.content();
00031 *log << "SERVER RESPONSE: " << response << std::endl;
00032 }
00033 if (c.content().size() == 0)
00034 {
00035 if (log) *log << "BIAS SERVER ERROR: " << c.error_msg() << std::endl;
00036 }
00037 return c.content();
00038 }
00039
00040 SamplingBias::
00041 SamplingBias(std::vector<id_type> const* sid2doc)
00042 : m_sid2docid(sid2doc)
00043 { }
00044
00045 int
00046 SamplingBias::
00047 GetClass(id_type const idx) const
00048 {
00049 return m_sid2docid ? m_sid2docid->at(idx) : -1;
00050 }
00051
00052 DocumentBias::
00053 DocumentBias(std::vector<id_type> const& sid2doc,
00054 std::map<std::string,id_type> const& docname2docid,
00055 std::string const& server_url,
00056 std::string const& text,
00057 std::ostream* _log)
00058 : SamplingBias(&sid2doc)
00059 {
00060 this->log = _log;
00061 #ifndef NO_MOSES
00062 Moses::Timer timer;
00063 if (_log) timer.start(NULL);
00064 #endif
00065 std::string json = query_bias_server(server_url, text, _log);
00066
00067 init_from_json(json, docname2docid, log);
00068 #ifndef NO_MOSES
00069 if (_log) *_log << "Bias query took " << timer << " seconds." << std::endl;
00070 #endif
00071 }
00072
00073 DocumentBias::
00074 DocumentBias(std::vector<id_type> const& sid2doc,
00075 std::map<std::string,id_type> const& docname2docid,
00076 std::map<std::string, float> const& context_weights,
00077 std::ostream* _log)
00078 : SamplingBias(&sid2doc)
00079 {
00080 this->log = _log;
00081 init(context_weights, docname2docid);
00082 }
00083
00084 SPTR<std::map<std::string, float> const>
00085 SamplingBias::
00086 getBiasMap() {
00087 return m_bias_map;
00088 }
00089
00090 const std::map<id_type, float>&
00091 DocumentBias::
00092 GetDocumentBiasMap() const {
00093 return m_bias;
00094 }
00095
00096 void
00097 DocumentBias::
00098 init_from_json
00099 ( std::string const& json, std::map<std::string,id_type> const& docname2docid,
00100 std::ostream* log)
00101 {
00102
00103
00104 std::string d; float total = 0; std::map<std::string,float> bias;
00105 size_t i = 0; while (i < json.size() && json[i] != '"') ++i;
00106 while (++i < json.size())
00107 {
00108 size_t k = i; while (i < json.size() && json[i] != '"') ++i;
00109 if (i >= json.size()) break;
00110 float& f = bias[json.substr(k,i-k)];
00111 while (++i < json.size() && json[i] != ':');
00112 k = ++i;
00113 while (++i < json.size() && json[i] != ',' && json[i] != '}');
00114 total += (f = atof(json.substr(k, i-k).c_str()));
00115 k = ++i; while (i < json.size() && json[i] != '"') ++i;
00116 }
00117
00118 typedef std::pair<std::string const,float> item;
00119 if (total) { BOOST_FOREACH(item& x, bias) { x.second /= total; } }
00120 init(bias, docname2docid);
00121
00122 }
00123
00124 void
00125 DocumentBias::
00126 init(std::map<std::string,float> const& biasmap,
00127 std::map<std::string,id_type> const& docname2docid)
00128 {
00129 typedef std::map<std::string, float>::value_type bias_record;
00130 float total = 0;
00131 m_bias_map.reset(new std::map<std::string,float>(biasmap));
00132 BOOST_FOREACH(bias_record const& b, biasmap)
00133 {
00134 std::map<std::string, id_type>::const_iterator m;
00135 m = docname2docid.find(b.first);
00136 if (m != docname2docid.end())
00137 total += (m_bias[m->second] = b.second);
00138 }
00139 if (total)
00140 {
00141 typedef std::map<id_type, float>::value_type item;
00142 BOOST_FOREACH(item& i, m_bias) i.second /= total;
00143 }
00144
00145 if (log)
00146 {
00147 BOOST_FOREACH(bias_record const& b, biasmap)
00148 {
00149 std::map<std::string, id_type>::const_iterator m;
00150 m = docname2docid.find(b.first);
00151 if (m != docname2docid.end())
00152 *log << "BIAS " << b.first << " " << m_bias[m->second]
00153 << std::endl;
00154 else
00155 *log << "WARNING: bias reported for unknown document "
00156 << b.first << std::endl;
00157 }
00158 }
00159 }
00160
00161 float
00162 DocumentBias::
00163 operator[](id_type const idx) const
00164 {
00165 std::map<id_type, float>::const_iterator m;
00166 m = m_bias.find((*m_sid2docid)[idx]);
00167 return m != m_bias.end() ? m->second : 0;
00168 }
00169
00170 size_t
00171 DocumentBias::
00172 size() const
00173 {
00174 return m_sid2docid->size();
00175 }
00176
00177
00178
00179 SentenceBias::
00180 SentenceBias(std::vector<float> const& bias,
00181 std::vector<id_type> const* sid2doc)
00182 : SamplingBias(sid2doc)
00183 , m_bias(bias)
00184 { }
00185
00186 SentenceBias::
00187 SentenceBias(size_t const s, float const f,
00188 std::vector<id_type> const* sid2doc)
00189
00190 : SamplingBias(sid2doc)
00191 , m_bias(s,f)
00192 { }
00193
00194 float&
00195 SentenceBias::
00196 operator[](id_type const idx)
00197 {
00198 UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds");
00199 return m_bias[idx];
00200 }
00201
00202 float
00203 SentenceBias::
00204 operator[](id_type const idx) const
00205 {
00206 UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds");
00207 return m_bias[idx];
00208 }
00209
00210 size_t
00211 SentenceBias::
00212 size() const { return m_bias.size(); }
00213
00214 }
00215