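/*
 * Pairwise sample generation for tuning: reads feature and score (n-best) data
 * files, samples pairs of hypotheses per sentence, and writes labelled
 * feature-difference vectors, in the style of PRO (Hopkins & May, 2011).
 */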
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <utility>

#include <boost/program_options.hpp>

#include "BleuScorer.h"
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
#include "Util.h"
#include "util/random.hh"

using namespace std;
using namespace MosesTuning;

namespace po = boost::program_options;

namespace MosesTuning
{

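// A sampled pair of hypotheses, each identified by (file index, hypothesis index).
// The constructor orders the pair so that translation1 is always the higher-scoring
// one, hence getDiff() is always non-negative.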
class SampledPair
{
private:
  pair<size_t,size_t> m_translation1;
  pair<size_t,size_t> m_translation2;
  float m_score_diff;

public:
  SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff) {
    if (diff > 0) {
      m_translation1 = t1;
      m_translation2 = t2;
      m_score_diff = diff;
    } else {
      m_translation1 = t2;
      m_translation2 = t1;
      m_score_diff = -diff;
    }
  }

  float getDiff() const {
    return m_score_diff;
  }
  const pair<size_t,size_t>& getTranslation1() const {
    return m_translation1;
  }
  const pair<size_t,size_t>& getTranslation2() const {
    return m_translation2;
  }
};

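// Writes the feature difference f1 - f2: dense features as " F<j> <diff>"
// (only where the values differ), followed by the sparse feature difference.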
static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2)
{
  // differences in the dense features
  for (unsigned int j = 0; j < f1.dense.size(); j++)
    if (abs(f1.dense[j] - f2.dense[j]) > 0.00001)
      out << " F" << j << " " << (f1.dense[j] - f2.dense[j]);

  if (f1.sparse.size() || f2.sparse.size()) {
    out << " ";

    // differences in the sparse features
    const SparseVector &s1 = f1.sparse;
    const SparseVector &s2 = f2.sparse;
    SparseVector diff = s1 - s2;
    diff.write(out);
  }
}

} // namespace MosesTuning

int main(int argc, char** argv)
{
  bool help;
  vector<string> scoreFiles;
  vector<string> featureFiles;
  int seed;
  string outputFile;

  // Sampling parameters: number of candidate pairs drawn per sentence, number of
  // pairs actually kept, and the minimum sentence-BLEU difference a pair must have.
  const unsigned int n_candidates = 5000;
  const unsigned int n_samples = 50;
  const float min_diff = 0.05f;
  bool smoothBP = false;
  const float bleuSmoothing = 1.0f;

  po::options_description desc("Allowed options");
  desc.add_options()
  ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
  ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
  ("ffile,F", po::value<vector<string> >(&featureFiles), "Feature data files")
  ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
  ("output-file,o", po::value<string>(&outputFile), "Output file")
  ("smooth-brevity-penalty,b", po::value(&smoothBP)->zero_tokens()->default_value(false), "Smooth the brevity penalty, as in Nakov et al. (Coling 2012)")
  ;

  po::options_description cmdline_options;
  cmdline_options.add(desc);
  po::variables_map vm;
  po::store(po::command_line_parser(argc, argv).
            options(cmdline_options).run(), vm);
  po::notify(vm);
  if (help) {
    cout << "Usage: " + string(argv[0]) + " [options]" << endl;
    cout << desc << endl;
    exit(0);
  }

  if (vm.count("random-seed")) {
    cerr << "Initialising random seed to " << seed << endl;
    util::rand_init(seed);
  } else {
    cerr << "Initialising random seed from system clock" << endl;
    util::rand_init();
  }

  if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
    cerr << "No data to process" << endl;
    exit(0);
  }

  if (featureFiles.size() != scoreFiles.size()) {
    cerr << "Error: Number of feature files (" << featureFiles.size() <<
         ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
    exit(1);
  }

  ostream* out;
  ofstream outFile;
  if (!outputFile.empty()) {
    outFile.open(outputFile.c_str());
    if (!(outFile)) {
      cerr << "Error: Failed to open " << outputFile << endl;
      exit(1);
    }
    out = &outFile;
  } else {
    out = &cout;
  }

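  // One feature-data and one score-data iterator per input file; the files are
  // read in parallel, sentence by sentence.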
  vector<FeatureDataIterator> featureDataIters;
  vector<ScoreDataIterator> scoreDataIters;
  for (size_t i = 0; i < featureFiles.size(); ++i) {
    featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
    scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
  }

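  // Main loop: process one sentence at a time. For each sentence, collect all
  // hypotheses from all files, identified by (file index, hypothesis index).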
  size_t sentenceId = 0;
  while (1) {
    vector<pair<size_t,size_t> > hypotheses;

    // Stop when the first feature file is exhausted; all files must end at the same point.
    if (featureDataIters[0] == FeatureDataIterator::end()) {
      break;
    }
    for (size_t i = 0; i < featureFiles.size(); ++i) {
      if (featureDataIters[i] == FeatureDataIterator::end()) {
        cerr << "Error: Feature file " << i << " ended prematurely" << endl;
        exit(1);
      }
      if (scoreDataIters[i] == ScoreDataIterator::end()) {
        cerr << "Error: Score file " << i << " ended prematurely" << endl;
        exit(1);
      }
      if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
        cerr << "Error: For sentence " << sentenceId << " features and scores have different size" << endl;
        exit(1);
      }
      for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
        hypotheses.push_back(pair<size_t,size_t>(i, j));
      }
    }

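    // Randomly sample candidate pairs of hypotheses and score each pair by its
    // smoothed sentence-BLEU difference; pairs that are too close are discarded.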
    vector<SampledPair> samples;
    vector<float> scores;
    size_t n_translations = hypotheses.size();
    for (size_t i = 0; i < n_candidates; i++) {
      size_t rand1 = util::rand_excl(n_translations);
      pair<size_t,size_t> translation1 = hypotheses[rand1];
      float bleu1 = smoothedSentenceBleu(scoreDataIters[translation1.first]->operator[](translation1.second), bleuSmoothing, smoothBP);

      size_t rand2 = util::rand_excl(n_translations);
      pair<size_t,size_t> translation2 = hypotheses[rand2];
      float bleu2 = smoothedSentenceBleu(scoreDataIters[translation2.first]->operator[](translation2.second), bleuSmoothing, smoothBP);

      // Ignore pairs whose BLEU difference is below the minimum
      if (abs(bleu1 - bleu2) < min_diff)
        continue;

      samples.push_back(SampledPair(translation1, translation2, bleu1 - bleu2));
      scores.push_back(1.0 - abs(bleu1 - bleu2));
    }

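    // If more pairs were sampled than we want to keep, choose a threshold on the BLEU
    // difference so that only the n_samples pairs with the largest differences pass.
    // scores holds 1 - |diff|, so the n_samples-th smallest score gives the cutoff.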
    float sample_threshold = -1.0;
    if (samples.size() > n_samples) {
      NTH_ELEMENT3(scores.begin(), scores.begin() + (n_samples - 1), scores.end());
      sample_threshold = 0.99999 - scores[n_samples - 1];
    }

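    // Emit each selected pair twice, once in each order, labelled "1" and "0"
    // (positive and negative training examples for the pairwise classifier).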
    size_t collected = 0;
    for (size_t i = 0; collected < n_samples && i < samples.size(); ++i) {
      if (samples[i].getDiff() < sample_threshold) continue;
      ++collected;
      size_t file_id1 = samples[i].getTranslation1().first;
      size_t hypo_id1 = samples[i].getTranslation1().second;
      size_t file_id2 = samples[i].getTranslation2().first;
      size_t hypo_id2 = samples[i].getTranslation2().second;
      *out << "1";
      outputSample(*out, featureDataIters[file_id1]->operator[](hypo_id1),
                   featureDataIters[file_id2]->operator[](hypo_id2));
      *out << endl;
      *out << "0";
      outputSample(*out, featureDataIters[file_id2]->operator[](hypo_id2),
                   featureDataIters[file_id1]->operator[](hypo_id1));
      *out << endl;
    }

    // Advance all iterators to the next sentence
    for (size_t i = 0; i < featureFiles.size(); ++i) {
      ++featureDataIters[i];
      ++scoreDataIters[i];
    }
    ++sentenceId;
  }

  outFile.close();

  return 0;
}