00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00030 #include <cmath>
00031 #include <cstddef>
00032 #include <cstdlib>
00033 #include <ctime>
00034 #include <iostream>
00035 #include <string>
00036 #include <vector>
00037 #include <utility>
00038
00039 #include <boost/program_options.hpp>
00040
00041 #include "BleuScorer.h"
00042 #include "FeatureDataIterator.h"
00043 #include "ScoreDataIterator.h"
00044 #include "BleuScorer.h"
00045 #include "Util.h"
00046 #include "util/random.hh"
00047
00048 using namespace std;
00049 using namespace MosesTuning;
00050
00051 namespace po = boost::program_options;
00052
00053 namespace MosesTuning
00054 {
00055
/**
 * One sampled pair of translations, stored in score order.
 *
 * Each translation is identified by an index pair (the caller in this file
 * uses (file index, hypothesis index)). After construction, translation 1 is
 * the one on the winning side of the score difference and the stored
 * difference is never negative.
 */
class SampledPair
{
public:
  /**
   * @param t1   identifier of the first sampled translation
   * @param t2   identifier of the second sampled translation
   * @param diff score difference between t1 and t2 (the caller passes
   *             score(t1) - score(t2)). Unless it is strictly positive the
   *             two translations are swapped and the sign is flipped.
   */
  SampledPair(const std::pair<std::size_t, std::size_t>& t1,
              const std::pair<std::size_t, std::size_t>& t2,
              float diff)
    : m_translation1(diff > 0 ? t1 : t2),
      m_translation2(diff > 0 ? t2 : t1),
      m_score_diff(diff > 0 ? diff : -diff)
  {
  }

  /** Score difference between translation 1 and translation 2. */
  float getDiff() const {
    return m_score_diff;
  }

  /** Identifier of the translation stored first (the better one). */
  const std::pair<std::size_t, std::size_t>& getTranslation1() const {
    return m_translation1;
  }

  /** Identifier of the translation stored second (the worse one). */
  const std::pair<std::size_t, std::size_t>& getTranslation2() const {
    return m_translation2;
  }

private:
  // Declaration order matters: it is the order used by the initializer list.
  std::pair<std::size_t, std::size_t> m_translation1;
  std::pair<std::size_t, std::size_t> m_translation2;
  float m_score_diff;
};
00086
00087 static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2)
00088 {
00089
00090 for(unsigned int j=0; j<f1.dense.size(); j++)
00091 if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
00092 out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);
00093
00094 if (f1.sparse.size() || f2.sparse.size()) {
00095 out << " ";
00096
00097
00098 const SparseVector &s1 = f1.sparse;
00099 const SparseVector &s2 = f2.sparse;
00100 SparseVector diff = s1 - s2;
00101 diff.write(out);
00102 }
00103 }
00104
00105 }
00106
00107 int main(int argc, char** argv)
00108 {
00109 bool help;
00110 vector<string> scoreFiles;
00111 vector<string> featureFiles;
00112 int seed;
00113 string outputFile;
00114
00115 const unsigned int n_candidates = 5000;
00116 const unsigned int n_samples = 50;
00117 const float min_diff = 0.05;
00118 bool smoothBP = false;
00119 const float bleuSmoothing = 1.0f;
00120
00121 po::options_description desc("Allowed options");
00122 desc.add_options()
00123 ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
00124 ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
00125 ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
00126 ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
00127 ("output-file,o", po::value<string>(&outputFile), "Output file")
00128 ("smooth-brevity-penalty,b", po::value(&smoothBP)->zero_tokens()->default_value(false), "Smooth the brevity penalty, as in Nakov et al. (Coling 2012)")
00129 ;
00130
00131 po::options_description cmdline_options;
00132 cmdline_options.add(desc);
00133 po::variables_map vm;
00134 po::store(po::command_line_parser(argc,argv).
00135 options(cmdline_options).run(), vm);
00136 po::notify(vm);
00137 if (help) {
00138 cout << "Usage: " + string(argv[0]) + " [options]" << endl;
00139 cout << desc << endl;
00140 exit(0);
00141 }
00142
00143 if (vm.count("random-seed")) {
00144 cerr << "Initialising random seed to " << seed << endl;
00145 util::rand_init(seed);
00146 } else {
00147 cerr << "Initialising random seed from system clock" << endl;
00148 util::rand_init();
00149 }
00150
00151 if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
00152 cerr << "No data to process" << endl;
00153 exit(0);
00154 }
00155
00156 if (featureFiles.size() != scoreFiles.size()) {
00157 cerr << "Error: Number of feature files (" << featureFiles.size() <<
00158 ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
00159 exit(1);
00160 }
00161
00162 ostream* out;
00163 ofstream outFile;
00164 if (!outputFile.empty() ) {
00165 outFile.open(outputFile.c_str());
00166 if (!(outFile)) {
00167 cerr << "Error: Failed to open " << outputFile << endl;
00168 exit(1);
00169 }
00170 out = &outFile;
00171 } else {
00172 out = &cout;
00173 }
00174
00175
00176 vector<FeatureDataIterator> featureDataIters;
00177 vector<ScoreDataIterator> scoreDataIters;
00178 for (size_t i = 0; i < featureFiles.size(); ++i) {
00179 featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
00180 scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
00181 }
00182
00183
00184 size_t sentenceId = 0;
00185 while(1) {
00186 vector<pair<size_t,size_t> > hypotheses;
00187
00188
00189 if (featureDataIters[0] == FeatureDataIterator::end()) {
00190 break;
00191 }
00192 for (size_t i = 0; i < featureFiles.size(); ++i) {
00193 if (featureDataIters[i] == FeatureDataIterator::end()) {
00194 cerr << "Error: Feature file " << i << " ended prematurely" << endl;
00195 exit(1);
00196 }
00197 if (scoreDataIters[i] == ScoreDataIterator::end()) {
00198 cerr << "Error: Score file " << i << " ended prematurely" << endl;
00199 exit(1);
00200 }
00201 if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
00202 cerr << "Error: For sentence " << sentenceId << " features and scores have different size" << endl;
00203 exit(1);
00204 }
00205 for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
00206 hypotheses.push_back(pair<size_t,size_t>(i,j));
00207 }
00208 }
00209
00210
00211 vector<SampledPair> samples;
00212 vector<float> scores;
00213 size_t n_translations = hypotheses.size();
00214 for(size_t i=0; i<n_candidates; i++) {
00215 size_t rand1 = util::rand_excl(n_translations);
00216 pair<size_t,size_t> translation1 = hypotheses[rand1];
00217 float bleu1 = smoothedSentenceBleu(scoreDataIters[translation1.first]->operator[](translation1.second), bleuSmoothing, smoothBP);
00218
00219 size_t rand2 = util::rand_excl(n_translations);
00220 pair<size_t,size_t> translation2 = hypotheses[rand2];
00221 float bleu2 = smoothedSentenceBleu(scoreDataIters[translation2.first]->operator[](translation2.second), bleuSmoothing, smoothBP);
00222
00223
00224
00225
00226
00227
00228 if (abs(bleu1-bleu2) < min_diff)
00229 continue;
00230
00231 samples.push_back(SampledPair(translation1, translation2, bleu1-bleu2));
00232 scores.push_back(1.0-abs(bleu1-bleu2));
00233 }
00234
00235 float sample_threshold = -1.0;
00236 if (samples.size() > n_samples) {
00237 NTH_ELEMENT3(scores.begin(), scores.begin() + (n_samples-1), scores.end());
00238 sample_threshold = 0.99999-scores[n_samples-1];
00239 }
00240
00241 size_t collected = 0;
00242 for (size_t i = 0; collected < n_samples && i < samples.size(); ++i) {
00243 if (samples[i].getDiff() < sample_threshold) continue;
00244 ++collected;
00245 size_t file_id1 = samples[i].getTranslation1().first;
00246 size_t hypo_id1 = samples[i].getTranslation1().second;
00247 size_t file_id2 = samples[i].getTranslation2().first;
00248 size_t hypo_id2 = samples[i].getTranslation2().second;
00249 *out << "1";
00250 outputSample(*out, featureDataIters[file_id1]->operator[](hypo_id1),
00251 featureDataIters[file_id2]->operator[](hypo_id2));
00252 *out << endl;
00253 *out << "0";
00254 outputSample(*out, featureDataIters[file_id2]->operator[](hypo_id2),
00255 featureDataIters[file_id1]->operator[](hypo_id1));
00256 *out << endl;
00257 }
00258
00259 for (size_t i = 0; i < featureFiles.size(); ++i) {
00260 ++featureDataIters[i];
00261 ++scoreDataIters[i];
00262 }
00263 ++sentenceId;
00264 }
00265
00266 outFile.close();
00267
00268 }