00001
00002
00003
00004
00005
00006
00007
00008
00009 #include <algorithm>
00010 #include <cmath>
00011 #include <fstream>
00012
00013 #include "Data.h"
00014 #include "Scorer.h"
00015 #include "ScorerFactory.h"
00016 #include "Util.h"
00017 #include "util/exception.hh"
00018
00019 #include "util/file_piece.hh"
00020 #include "util/random.hh"
00021 #include "util/tokenize_piece.hh"
00022 #include "util/string_piece.hh"
00023 #include "FeatureDataIterator.h"
00024
00025 using namespace std;
00026
00027 namespace MosesTuning
00028 {
00029
00030 Data::Data(Scorer* scorer, const string& sparse_weights_file)
00031 : m_scorer(scorer),
00032 m_score_type(m_scorer->getName()),
00033 m_num_scores(0),
00034 m_score_data(new ScoreData(m_scorer)),
00035 m_feature_data(new FeatureData)
00036 {
00037 TRACE_ERR("Data::m_score_type " << m_score_type << endl);
00038 TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
00039 if (sparse_weights_file.size()) {
00040 m_sparse_weights.load(sparse_weights_file);
00041 ostringstream msg;
00042 msg << "Data::sparse_weights {";
00043 m_sparse_weights.write(msg,"=");
00044 msg << "}";
00045 TRACE_ERR(msg.str() << std::endl);
00046 }
00047 }
00048
00049
00050
00051
00052 void Data::removeDuplicates()
00053 {
00054 size_t nSentences = m_feature_data->size();
00055 assert(m_score_data->size() == nSentences);
00056
00057 for (size_t s = 0; s < nSentences; s++) {
00058 FeatureArray& feat_array = m_feature_data->get(s);
00059 ScoreArray& score_array = m_score_data->get(s);
00060
00061 assert(feat_array.size() == score_array.size());
00062
00063
00064 map<double, vector<size_t> > lookup;
00065
00066 size_t end_pos = feat_array.size() - 1;
00067
00068 size_t nRemoved = 0;
00069
00070 for (size_t k = 0; k <= end_pos; k++) {
00071 const FeatureStats& cur_feats = feat_array.get(k);
00072 double sum = 0.0;
00073 for (size_t l = 0; l < cur_feats.size(); l++)
00074 sum += cur_feats.get(l);
00075
00076 if (lookup.find(sum) != lookup.end()) {
00077
00078
00079 vector<size_t>& cur_list = lookup[sum];
00080
00081
00082
00083
00084 size_t l = 0;
00085 for (l = 0; l < cur_list.size(); l++) {
00086 size_t j = cur_list[l];
00087
00088 if (cur_feats == feat_array.get(j)
00089 && score_array.get(k) == score_array.get(j)) {
00090 if (k < end_pos) {
00091 feat_array.swap(k,end_pos);
00092 score_array.swap(k,end_pos);
00093 k--;
00094 }
00095 end_pos--;
00096 nRemoved++;
00097 break;
00098 }
00099 }
00100 if (l == lookup[sum].size())
00101 cur_list.push_back(k);
00102 } else {
00103 lookup[sum].push_back(k);
00104 }
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123 }
00124
00125 if (nRemoved > 0) {
00126 feat_array.resize(end_pos+1);
00127 score_array.resize(end_pos+1);
00128 }
00129 }
00130 }
00131
00132
00133 void Data::load(const std::string &featfile, const std::string &scorefile)
00134 {
00135 m_feature_data->load(featfile, m_sparse_weights);
00136 m_score_data->load(scorefile);
00137 }
00138
00139 void Data::loadNBest(const string &file, bool oneBest)
00140 {
00141 TRACE_ERR("loading nbest from " << file << endl);
00142 util::FilePiece in(file.c_str());
00143
00144 ScoreStats scoreentry;
00145 string sentence, feature_str, alignment;
00146 int sentence_index;
00147
00148 while (true) {
00149 try {
00150 StringPiece line = in.ReadLine();
00151 if (line.empty()) continue;
00152
00153 scoreentry.clear();
00154
00155 util::TokenIter<util::MultiCharacter> it(line, util::MultiCharacter("|||"));
00156
00157 sentence_index = ParseInt(*it);
00158 if (oneBest && m_score_data->exists(sentence_index)) continue;
00159 ++it;
00160 sentence = it->as_string();
00161 ++it;
00162 feature_str = it->as_string();
00163 ++it;
00164
00165 if (it) {
00166 ++it;
00167
00168 if (it) {
00169 alignment = it->as_string();
00170 ++it;
00171 if (it) {
00172 alignment = it->as_string();
00173 }
00174 }
00175 }
00176
00177
00178 if (m_scorer->useAlignment()) {
00179 sentence += "|||";
00180 sentence += alignment;
00181 }
00182 m_scorer->prepareStats(sentence_index, sentence, scoreentry);
00183
00184 m_score_data->add(scoreentry, sentence_index);
00185
00186
00187 if (!existsFeatureNames()) {
00188 InitFeatureMap(feature_str);
00189 }
00190 AddFeatures(feature_str, sentence_index);
00191 } catch (util::EndOfFileException &e) {
00192 PrintUserTime("Loaded N-best lists");
00193 break;
00194 }
00195 }
00196 }
00197
00198 void Data::save(const std::string &featfile, const std::string &scorefile, bool bin)
00199 {
00200 if (bin)
00201 cerr << "Binary write mode is selected" << endl;
00202 else
00203 cerr << "Binary write mode is NOT selected" << endl;
00204
00205 m_feature_data->save(featfile, bin);
00206 m_score_data->save(scorefile, bin);
00207 }
00208
00209 void Data::InitFeatureMap(const string& str)
00210 {
00211 string buf = str;
00212 string substr;
00213 string features = "";
00214 string tmp_name = "";
00215 size_t tmp_index = 0;
00216
00217 while (!buf.empty()) {
00218 getNextPound(buf, substr);
00219
00220
00221 if (!EndsWith(substr, "=")) {
00222 stringstream ss;
00223 ss << tmp_name << "_" << tmp_index << " ";
00224 features.append(ss.str());
00225
00226 tmp_index++;
00227 } else if (substr.find("_") != string::npos) {
00228
00229 getNextPound(buf, substr);
00230 } else {
00231 tmp_index = 0;
00232 tmp_name = substr.substr(0, substr.size() - 1);
00233 }
00234 }
00235 m_feature_data->setFeatureMap(features);
00236 }
00237
00238 void Data::AddFeatures(const string& str,
00239 int sentence_index)
00240 {
00241 string buf = str;
00242 string substr;
00243 FeatureStats feature_entry;
00244 feature_entry.reset();
00245
00246 while (!buf.empty()) {
00247 getNextPound(buf, substr);
00248
00249
00250 if (!EndsWith(substr, "=")) {
00251 feature_entry.add(ConvertStringToFeatureStatsType(substr));
00252 } else if (substr.find("_") != string::npos) {
00253
00254 string name = substr;
00255 getNextPound(buf, substr);
00256 feature_entry.addSparse(name, atof(substr.c_str()));
00257 }
00258 }
00259 m_feature_data->add(feature_entry, sentence_index);
00260 }
00261
00262 void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
00263 vector<Data>& shards)
00264 {
00265 UTIL_THROW_IF(shard_count == 0, util::Exception, "Must have at least 1 shard");
00266 UTIL_THROW_IF(shard_size < 0 || shard_size > 1,
00267 util::Exception,
00268 "Shard size must be between 0 and 1, inclusive. Currently " << shard_size);
00269
00270 size_t data_size = m_score_data->size();
00271 UTIL_THROW_IF(data_size != m_feature_data->size(),
00272 util::Exception,
00273 "Error");
00274
00275 shard_size *= data_size;
00276 const float coeff = static_cast<float>(data_size) / shard_count;
00277
00278 for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
00279 vector<size_t> shard_contents;
00280 if (shard_size == 0) {
00281
00282 const size_t shard_start = floor(0.5 + shard_id * coeff);
00283 const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff);
00284 for (size_t i = shard_start; i < shard_end; ++i) {
00285 shard_contents.push_back(i);
00286 }
00287 } else {
00288
00289 for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
00290 shard_contents.push_back(util::rand_excl(data_size));
00291 }
00292 }
00293
00294 Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig);
00295
00296 shards.push_back(Data(scorer));
00297 shards.back().m_score_type = m_score_type;
00298 shards.back().m_num_scores = m_num_scores;
00299 for (size_t i = 0; i < shard_contents.size(); ++i) {
00300 shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i]));
00301 shards.back().m_score_data->add(m_score_data->get(shard_contents[i]));
00302 }
00303
00304 }
00305 }
00306
00307 }