00001
00005 #include <limits>
00006 #include <unistd.h>
00007 #include <cstdlib>
00008 #include <iostream>
00009 #include <fstream>
00010 #include <cmath>
00011 #include <ctime>
00012
00013 #include <getopt.h>
00014 #include <boost/scoped_ptr.hpp>
00015
00016 #include "Data.h"
00017 #include "Point.h"
00018 #include "Scorer.h"
00019 #include "ScorerFactory.h"
00020 #include "ScoreData.h"
00021 #include "FeatureData.h"
00022 #include "Optimizer.h"
00023 #include "OptimizerFactory.h"
00024 #include "Types.h"
00025 #include "Timer.h"
00026 #include "Util.h"
00027 #include "util/random.hh"
00028
00029 #include "moses/ThreadPool.h"
00030
00031 using namespace std;
00032 using namespace MosesTuning;
00033
00034 namespace
00035 {
00036
// Built-in defaults picked up by ProgramOption's constructor when the
// matching command-line switch is not given.
const char kDefaultOptimizer[]   = "powell";
const char kDefaultScorer[]      = "BLEU";

// Default input files: scorer statistics, feature values, starting point.
const char kDefaultScorerFile[]  = "statscore.data";
const char kDefaultFeatureFile[] = "features.data";
const char kDefaultInitFile[]    = "init.opt";

// Empty by default: no positivity list and no sparse weights file.
const char kDefaultPositiveString[]    = "";
const char kDefaultSparseWeightsFile[] = "";

// File that main() writes the final weight vector to.
const char kOutputFile[] = "weights.txt";
00047
00051 class OptimizationTask : public Moses::Task
00052 {
00053 public:
00054 OptimizationTask(Optimizer* optimizer, const Point& point)
00055 : m_optimizer(optimizer), m_point(point) {}
00056
00057 ~OptimizationTask() {}
00058
00059 virtual void Run() {
00060 m_score = m_optimizer->Run(m_point);
00061 }
00062
00063 virtual bool DeleteAfterExecution() {
00064 return false;
00065 }
00066
00067 void resetOptimizer() {
00068 if (m_optimizer) {
00069 delete m_optimizer;
00070 m_optimizer = NULL;
00071 }
00072 }
00073
00074 statscore_t getScore() const {
00075 return m_score;
00076 }
00077
00078 const Point& getPoint() const {
00079 return m_point;
00080 }
00081
00082 private:
00083
00084 OptimizationTask() {}
00085
00086 Optimizer* m_optimizer;
00087 Point m_point;
00088 statscore_t m_score;
00089 };
00090
00091 bool WriteFinalWeights(const char* filename, const Point& point)
00092 {
00093 ofstream ofs(filename);
00094 if (!ofs) {
00095 cerr << "Cannot open " << filename << endl;
00096 return false;
00097 }
00098
00099 ofs << point << endl;
00100
00101 return true;
00102 }
00103
00104
00105 void usage(int ret)
00106 {
00107 cerr<<"usage: mert -d <dimensions> (mandatory )"<<endl;
00108 cerr<<"[-n] retry ntimes (default 1)"<<endl;
00109 cerr<<"[-m] number of random directions in powell (default 0)"<<endl;
00110 cerr<<"[-o] the indexes to optimize(default all)"<<endl;
00111 cerr<<"[-t] the optimizer(default powell)"<<endl;
00112 cerr<<"[-r] the random seed (defaults to system clock)"<<endl;
00113 cerr<<"[--sctype|-s] the scorer type (default BLEU)"<<endl;
00114 cerr<<"[--scconfig|-c] configuration string passed to scorer"<<endl;
00115 cerr<<"[--scfile|-S] comma separated list of scorer data files (default score.data)"<<endl;
00116 cerr<<"[--ffile|-F] comma separated list of feature data files (default feature.data)"<<endl;
00117 cerr<<"[--ifile|-i] the starting point data file (default init.opt)"<<endl;
00118 cerr<<"[--sparse-weights|-p] required for merging sparse features"<<endl;
00119 #ifdef WITH_THREADS
00120 cerr<<"[--threads|-T] use multiple threads (default 1)"<<endl;
00121 #endif
00122 cerr<<"[--shard-count] Split data into shards, optimize for each shard and average"<<endl;
00123 cerr<<"[--shard-size] Shard size as proportion of data. If 0, use non-overlapping shards"<<endl;
00124 cerr<<"[-v] verbose level"<<endl;
00125 cerr<<"[--help|-h] print this message and exit"<<endl;
00126 exit(ret);
00127 }
00128
// Long-option table for getopt_long(). Every entry maps to the character
// handled in ParseCommandOptions' switch; 'a' and 'b' have no short form.
// The all-zero entry terminates the table.
static struct option long_options[] = {
  {"pdim",           required_argument, NULL, 'd'},
  {"ntry",           required_argument, NULL, 'n'},
  {"nrandom",        required_argument, NULL, 'm'},
  {"rseed",          required_argument, NULL, 'r'},
  {"optimize",       required_argument, NULL, 'o'},
  {"type",           required_argument, NULL, 't'},
  {"sctype",         required_argument, NULL, 's'},
  {"scconfig",       required_argument, NULL, 'c'},
  {"scfile",         required_argument, NULL, 'S'},
  {"ffile",          required_argument, NULL, 'F'},
  {"ifile",          required_argument, NULL, 'i'},
  {"sparse-weights", required_argument, NULL, 'p'},
#ifdef WITH_THREADS
  {"threads",        required_argument, NULL, 'T'},
#endif
  {"shard-count",    required_argument, NULL, 'a'},
  {"shard-size",     required_argument, NULL, 'b'},
  {"verbose",        required_argument, NULL, 'v'},
  {"help",           no_argument,       NULL, 'h'},
  {NULL, 0, NULL, 0}
};
00151
00152 struct ProgramOption {
00153 string to_optimize_str;
00154 int pdim;
00155 int ntry;
00156 int nrandom;
00157 int seed;
00158 bool has_seed;
00159 string optimize_type;
00160 string scorer_type;
00161 string scorer_config;
00162 string scorer_file;
00163 string feature_file;
00164 string init_file;
00165 string positive_string;
00166 string sparse_weights_file;
00167 size_t num_threads;
00168 float shard_size;
00169 size_t shard_count;
00170
00171 ProgramOption()
00172 : to_optimize_str(""),
00173 pdim(-1),
00174 ntry(1),
00175 nrandom(0),
00176 seed(0),
00177 has_seed(false),
00178 optimize_type(kDefaultOptimizer),
00179 scorer_type(kDefaultScorer),
00180 scorer_config(""),
00181 scorer_file(kDefaultScorerFile),
00182 feature_file(kDefaultFeatureFile),
00183 init_file(kDefaultInitFile),
00184 positive_string(kDefaultPositiveString),
00185 sparse_weights_file(kDefaultSparseWeightsFile),
00186 num_threads(1),
00187 shard_size(0),
00188 shard_count(0) { }
00189 };
00190
00191 void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
00192 {
00193 int c;
00194 int option_index;
00195
00196 while ((c = getopt_long(argc, argv, "o:r:d:n:m:t:s:S:F:v:p:P:", long_options, &option_index)) != -1) {
00197 switch (c) {
00198 case 'o':
00199 opt->to_optimize_str = string(optarg);
00200 break;
00201 case 'd':
00202 opt->pdim = strtol(optarg, NULL, 10);
00203 break;
00204 case 'n':
00205 opt->ntry = strtol(optarg, NULL, 10);
00206 break;
00207 case 'm':
00208 opt->nrandom = strtol(optarg, NULL, 10);
00209 break;
00210 case 'r':
00211 opt->seed = strtol(optarg, NULL, 10);
00212 opt->has_seed = true;
00213 break;
00214 case 't':
00215 opt->optimize_type = string(optarg);
00216 break;
00217 case's':
00218 opt->scorer_type = string(optarg);
00219 break;
00220 case 'c':
00221 opt->scorer_config = string(optarg);
00222 break;
00223 case 'S':
00224 opt->scorer_file = string(optarg);
00225 break;
00226 case 'F':
00227 opt->feature_file = string(optarg);
00228 break;
00229 case 'i':
00230 opt->init_file = string(optarg);
00231 break;
00232 case 'p':
00233 opt->sparse_weights_file=string(optarg);
00234 break;
00235 case 'v':
00236 setverboselevel(strtol(optarg, NULL, 10));
00237 break;
00238 #ifdef WITH_THREADS
00239 case 'T':
00240 opt->num_threads = strtol(optarg, NULL, 10);
00241 if (opt->num_threads < 1) opt->num_threads = 1;
00242 break;
00243 #endif
00244 case 'a':
00245 opt->shard_count = strtof(optarg, NULL);
00246 break;
00247 case 'b':
00248 opt->shard_size = strtof(optarg, NULL);
00249 break;
00250 case 'h':
00251 usage(0);
00252 break;
00253 case 'P':
00254 opt->positive_string = string(optarg);
00255 break;
00256 default:
00257 usage(1);
00258 }
00259 }
00260 }
00261
00262 }
00263
00264 int main(int argc, char **argv)
00265 {
00266 ResetUserTime();
00267
00268 ProgramOption option;
00269 ParseCommandOptions(argc, argv, &option);
00270
00271 vector<unsigned> to_optimize;
00272 vector<vector<parameter_t> > start_list;
00273 vector<parameter_t> min;
00274 vector<parameter_t> max;
00275 vector<bool> positive;
00276
00277
00278 if (option.pdim < 0)
00279 usage(1);
00280
00281 cerr << "shard_size = " << option.shard_size << " shard_count = " << option.shard_count << endl;
00282 if (option.shard_size && !option.shard_count) {
00283 cerr << "Error: shard-size provided without shard-count" << endl;
00284 exit(1);
00285 }
00286 if (option.shard_size > 1 || option.shard_size < 0) {
00287 cerr << "Error: shard-size should be between 0 and 1" << endl;
00288 exit(1);
00289 }
00290
00291 if (option.has_seed) {
00292 cerr << "Seeding random numbers with " << option.seed << endl;
00293 util::rand_init(option.seed);
00294 } else {
00295 cerr << "Seeding random numbers with system clock " << endl;
00296 util::rand_init();
00297 }
00298
00299 if (option.sparse_weights_file.size()) ++option.pdim;
00300
00301
00302 string onefile;
00303 while (!option.init_file.empty()) {
00304 getNextPound(option.init_file, onefile, ",");
00305 vector<parameter_t> start;
00306 ifstream opt(onefile.c_str());
00307 if (opt.fail()) {
00308 cerr << "could not open initfile: " << option.init_file << endl;
00309 exit(3);
00310 }
00311 start.resize(option.pdim);
00312 int j;
00313 for (j = 0; j < option.pdim && !opt.fail(); j++) {
00314 opt >> start[j];
00315 }
00316 if (j < option.pdim) {
00317 cerr << option.init_file << ":Too few starting weights." << endl;
00318 exit(3);
00319 }
00320 start_list.push_back(start);
00321
00322 if (start_list.size() == 1) {
00323 min.resize(option.pdim);
00324 for (j = 0; j < option.pdim && !opt.fail(); j++) {
00325 opt >> min[j];
00326 }
00327 if (j < option.pdim) {
00328 cerr << option.init_file << ":Too few minimum weights." << endl;
00329 cerr << "error could not initialize start point with " << option.init_file << endl;
00330 cerr << "j: " << j << ", pdim: " << option.pdim << endl;
00331 exit(3);
00332 }
00333 max.resize(option.pdim);
00334 for (j = 0; j < option.pdim && !opt.fail(); j++) {
00335 opt >> max[j];
00336 }
00337 if (j < option.pdim) {
00338 cerr << option.init_file << ":Too few maximum weights." << endl;
00339 exit(3);
00340 }
00341 }
00342 opt.close();
00343 }
00344
00345 vector<string> ScoreDataFiles;
00346 if (option.scorer_file.length() > 0) {
00347 Tokenize(option.scorer_file.c_str(), ',', &ScoreDataFiles);
00348 }
00349
00350 vector<string> FeatureDataFiles;
00351 if (option.feature_file.length() > 0) {
00352 Tokenize(option.feature_file.c_str(), ',', &FeatureDataFiles);
00353 }
00354
00355 if (ScoreDataFiles.size() != FeatureDataFiles.size()) {
00356 throw runtime_error("Error: there is a different number of previous score and feature files");
00357 }
00358
00359
00360 boost::scoped_ptr<Scorer> scorer(
00361 ScorerFactory::getScorer(option.scorer_type, option.scorer_config));
00362
00363
00364 Data data(scorer.get(), option.sparse_weights_file);
00365
00366 for (size_t i = 0; i < ScoreDataFiles.size(); i++) {
00367 cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl;
00368 data.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
00369 }
00370
00371 scorer->setScoreData(data.getScoreData().get());
00372
00373 data.removeDuplicates();
00374
00375 PrintUserTime("Data loaded");
00376
00377
00378
00379
00380
00381
00382
00383 if (option.to_optimize_str.length() > 0) {
00384 cerr << "Weights to optimize: " << option.to_optimize_str << endl;
00385
00386
00387 vector<string> features;
00388 Tokenize(option.to_optimize_str.c_str(), ',', &features);
00389
00390 for (vector<string>::const_iterator it = features.begin();
00391 it != features.end(); ++it) {
00392 const int feature_index = data.getFeatureIndex(*it);
00393
00394
00395
00396
00397
00398 if (feature_index < 0) {
00399 cerr << "Error: invalid feature index = " << feature_index << endl;
00400 exit(1);
00401 }
00402 cerr << "FeatNameIndex: " << feature_index << " to insert" << endl;
00403 to_optimize.push_back(feature_index);
00404 }
00405 } else {
00406
00407 to_optimize.resize(option.pdim);
00408 for (int i = 0; i < option.pdim; i++) {
00409 to_optimize[i] = 1;
00410 }
00411 }
00412
00413 positive.resize(option.pdim);
00414 for (int i = 0; i < option.pdim; i++)
00415 positive[i] = false;
00416 if (option.positive_string.length() > 0) {
00417
00418 std::string substring;
00419 int index;
00420 while (!option.positive_string.empty()) {
00421 getNextPound(option.positive_string, substring, ",");
00422 index = data.getFeatureIndex(substring);
00423
00424 if (index >= 0 && index < option.pdim) {
00425 positive[index] = true;
00426 } else {
00427 cerr << "Index " << index
00428 << " is out of bounds in positivity list. Allowed indexes are [0,"
00429 << (option.pdim-1) << "]." << endl;
00430 }
00431 }
00432 }
00433
00434 #ifdef WITH_THREADS
00435 cerr << "Creating a pool of " << option.num_threads << " threads" << endl;
00436 Moses::ThreadPool pool(option.num_threads);
00437 #endif
00438
00439 Point::setpdim(option.pdim);
00440 Point::setdim(to_optimize.size());
00441 Point::set_optindices(to_optimize);
00442
00443
00444 vector<Point> startingPoints;
00445
00446 for (size_t i = 0; i < start_list.size(); ++i) {
00447 startingPoints.push_back(Point(start_list[i], min, max));
00448 }
00449
00450 for (int i = 0; i < option.ntry; ++i) {
00451 startingPoints.push_back(Point(start_list[0], min, max));
00452 startingPoints.back().Randomize();
00453 }
00454
00455 vector<vector<boost::shared_ptr<OptimizationTask> > > allTasks(1);
00456
00457
00458 vector<Data> shards;
00459 if (option.shard_count) {
00460 data.createShards(option.shard_count, option.shard_size, option.scorer_config, shards);
00461 allTasks.resize(option.shard_count);
00462 }
00463
00464
00465 for (size_t i = 0; i < allTasks.size(); ++i) {
00466 Data& data_ref = data;
00467 if (option.shard_count)
00468 data_ref = shards[i];
00469
00470 vector<boost::shared_ptr<OptimizationTask> >& tasks = allTasks[i];
00471 Optimizer *optimizer = OptimizerFactory::BuildOptimizer(option.pdim, to_optimize, positive, start_list[0], option.optimize_type, option.nrandom);
00472 optimizer->SetScorer(data_ref.getScorer());
00473 optimizer->SetFeatureData(data_ref.getFeatureData());
00474
00475 for (size_t j = 0; j < startingPoints.size(); ++j) {
00476 boost::shared_ptr<OptimizationTask>
00477 task(new OptimizationTask(optimizer, startingPoints[j]));
00478 tasks.push_back(task);
00479 #ifdef WITH_THREADS
00480 pool.Submit(task);
00481 #else
00482 task->Run();
00483 #endif
00484 }
00485 }
00486
00487
00488 #ifdef WITH_THREADS
00489 pool.Stop(true);
00490 #endif
00491
00492 statscore_t total = 0;
00493 Point totalP;
00494
00495
00496 for (size_t i = 0; i < allTasks.size(); ++i) {
00497 statscore_t best = 0, mean = 0, var = 0;
00498 Point bestP;
00499 for (size_t j = 0; j < allTasks[i].size(); ++j) {
00500 statscore_t score = allTasks[i][j]->getScore();
00501 mean += score;
00502 var += score * score;
00503 if (score > best) {
00504 bestP = allTasks[i][j]->getPoint();
00505 best = score;
00506 }
00507 }
00508
00509 mean /= static_cast<float>(option.ntry);
00510 var /= static_cast<float>(option.ntry);
00511 var = sqrt(abs(var - mean * mean));
00512
00513 if (verboselevel() > 1) {
00514 cerr << "shard " << i << " best score: " << best << " variance of the score (for " << option.ntry << " try): " << var << endl;
00515 }
00516
00517 totalP += bestP;
00518 total += best;
00519 if (verboselevel() > 1)
00520 cerr << "bestP " << bestP << endl;
00521 }
00522
00523
00524 Point finalP = totalP * (1.0 / allTasks.size());
00525 statscore_t final = total / allTasks.size();
00526
00527 if (verboselevel() > 1)
00528 cerr << "bestP: " << finalP << endl;
00529
00530
00531 if (static_cast<int>(to_optimize.size()) == option.pdim) {
00532 finalP.NormalizeL1();
00533 }
00534
00535 cerr << "Best point: " << finalP << " => " << final << endl;
00536
00537 if (!WriteFinalWeights(kOutputFile, finalP)) {
00538 cerr << "Warning: Failed to write the final point" << endl;
00539 }
00540
00541 for (size_t i = 0; i < allTasks.size(); ++i) {
00542 allTasks[i][0]->resetOptimizer();
00543 }
00544
00545 PrintUserTime("Stopping...");
00546
00547 return 0;
00548 }