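/*
 * Pairwise sample generation for tuning: reads feature and score (n-best) data
 * files, samples pairs of hypotheses per sentence, and writes labelled
 * feature-difference vectors, in the style of PRO (Hopkins & May, 2011).
 */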
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <utility>

#include <boost/program_options.hpp>

#include "BleuScorer.h"
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
#include "Util.h"
#include "util/random.hh"

using namespace std;
using namespace MosesTuning;

namespace po = boost::program_options;

namespace MosesTuning
{

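// A sampled pair of hypotheses, each identified by (file index, hypothesis index).
// The constructor orders the pair so that translation1 is always the higher-scoring
// one, hence getDiff() is always non-negative.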
class SampledPair
{
private:
  pair<size_t,size_t> m_translation1;
  pair<size_t,size_t> m_translation2;
  float m_score_diff;

public:
  SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff) {
    if (diff > 0) {
      m_translation1 = t1;
      m_translation2 = t2;
      m_score_diff = diff;
    } else {
      m_translation1 = t2;
      m_translation2 = t1;
      m_score_diff = -diff;
    }
  }

  float getDiff() const {
    return m_score_diff;
  }
  const pair<size_t,size_t>& getTranslation1() const {
    return m_translation1;
  }
  const pair<size_t,size_t>& getTranslation2() const {
    return m_translation2;
  }
};

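// Writes the feature difference f1 - f2: dense features as " F<j> <diff>"
// (only where the values differ), followed by the sparse feature difference.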
static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2)
{
  // differences in the dense features
  for (unsigned int j = 0; j < f1.dense.size(); j++)
    if (abs(f1.dense[j] - f2.dense[j]) > 0.00001)
      out << " F" << j << " " << (f1.dense[j] - f2.dense[j]);

  if (f1.sparse.size() || f2.sparse.size()) {
    out << " ";

    // differences in the sparse features
    const SparseVector &s1 = f1.sparse;
    const SparseVector &s2 = f2.sparse;
    SparseVector diff = s1 - s2;
    diff.write(out);
  }
}

} // namespace MosesTuning

int main(int argc, char** argv)
{
  bool help;
  vector<string> scoreFiles;
  vector<string> featureFiles;
  int seed;
  string outputFile;

  // Sampling parameters: number of candidate pairs drawn per sentence, number of
  // pairs actually kept, and the minimum sentence-BLEU difference a pair must have.
  const unsigned int n_candidates = 5000;
  const unsigned int n_samples = 50;
  const float min_diff = 0.05f;
  bool smoothBP = false;
  const float bleuSmoothing = 1.0f;

  po::options_description desc("Allowed options");
  desc.add_options()
  ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
  ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
  ("ffile,F", po::value<vector<string> >(&featureFiles), "Feature data files")
  ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
  ("output-file,o", po::value<string>(&outputFile), "Output file")
  ("smooth-brevity-penalty,b", po::value(&smoothBP)->zero_tokens()->default_value(false), "Smooth the brevity penalty, as in Nakov et al. (Coling 2012)")
  ;

  po::options_description cmdline_options;
  cmdline_options.add(desc);
  po::variables_map vm;
  po::store(po::command_line_parser(argc, argv).
            options(cmdline_options).run(), vm);
  po::notify(vm);
  if (help) {
    cout << "Usage: " + string(argv[0]) + " [options]" << endl;
    cout << desc << endl;
    exit(0);
  }

  if (vm.count("random-seed")) {
    cerr << "Initialising random seed to " << seed << endl;
    util::rand_init(seed);
  } else {
    cerr << "Initialising random seed from system clock" << endl;
    util::rand_init();
  }

  if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
    cerr << "No data to process" << endl;
    exit(0);
  }

  if (featureFiles.size() != scoreFiles.size()) {
    cerr << "Error: Number of feature files (" << featureFiles.size() <<
         ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
    exit(1);
  }

  ostream* out;
  ofstream outFile;
  if (!outputFile.empty()) {
    outFile.open(outputFile.c_str());
    if (!(outFile)) {
      cerr << "Error: Failed to open " << outputFile << endl;
      exit(1);
    }
    out = &outFile;
  } else {
    out = &cout;
  }

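  // One feature-data and one score-data iterator per input file; the files are
  // read in parallel, sentence by sentence.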
  vector<FeatureDataIterator> featureDataIters;
  vector<ScoreDataIterator> scoreDataIters;
  for (size_t i = 0; i < featureFiles.size(); ++i) {
    featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
    scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
  }

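  // Main loop: process one sentence at a time. For each sentence, collect all
  // hypotheses from all files, identified by (file index, hypothesis index).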
  size_t sentenceId = 0;
  while (1) {
    vector<pair<size_t,size_t> > hypotheses;

    // Stop when the first feature file is exhausted; all files must end at the same point.
    if (featureDataIters[0] == FeatureDataIterator::end()) {
      break;
    }
    for (size_t i = 0; i < featureFiles.size(); ++i) {
      if (featureDataIters[i] == FeatureDataIterator::end()) {
        cerr << "Error: Feature file " << i << " ended prematurely" << endl;
        exit(1);
      }
      if (scoreDataIters[i] == ScoreDataIterator::end()) {
        cerr << "Error: Score file " << i << " ended prematurely" << endl;
        exit(1);
      }
      if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
        cerr << "Error: For sentence " << sentenceId << " features and scores have different size" << endl;
        exit(1);
      }
      for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
        hypotheses.push_back(pair<size_t,size_t>(i, j));
      }
    }

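    // Randomly sample candidate pairs of hypotheses and score each pair by its
    // smoothed sentence-BLEU difference; pairs that are too close are discarded.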
    vector<SampledPair> samples;
    vector<float> scores;
    size_t n_translations = hypotheses.size();
    for (size_t i = 0; i < n_candidates; i++) {
      size_t rand1 = util::rand_excl(n_translations);
      pair<size_t,size_t> translation1 = hypotheses[rand1];
      float bleu1 = smoothedSentenceBleu(scoreDataIters[translation1.first]->operator[](translation1.second), bleuSmoothing, smoothBP);

      size_t rand2 = util::rand_excl(n_translations);
      pair<size_t,size_t> translation2 = hypotheses[rand2];
      float bleu2 = smoothedSentenceBleu(scoreDataIters[translation2.first]->operator[](translation2.second), bleuSmoothing, smoothBP);

      // Ignore pairs whose BLEU difference is below the minimum
      if (abs(bleu1 - bleu2) < min_diff)
        continue;

      samples.push_back(SampledPair(translation1, translation2, bleu1 - bleu2));
      scores.push_back(1.0 - abs(bleu1 - bleu2));
    }

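    // If more pairs were sampled than we want to keep, choose a threshold on the BLEU
    // difference so that only the n_samples pairs with the largest differences pass.
    // scores holds 1 - |diff|, so the n_samples-th smallest score gives the cutoff.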
    float sample_threshold = -1.0;
    if (samples.size() > n_samples) {
      NTH_ELEMENT3(scores.begin(), scores.begin() + (n_samples - 1), scores.end());
      sample_threshold = 0.99999 - scores[n_samples - 1];
    }

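    // Emit each selected pair twice, once in each order, labelled "1" and "0"
    // (positive and negative training examples for the pairwise classifier).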
    size_t collected = 0;
    for (size_t i = 0; collected < n_samples && i < samples.size(); ++i) {
      if (samples[i].getDiff() < sample_threshold) continue;
      ++collected;
      size_t file_id1 = samples[i].getTranslation1().first;
      size_t hypo_id1 = samples[i].getTranslation1().second;
      size_t file_id2 = samples[i].getTranslation2().first;
      size_t hypo_id2 = samples[i].getTranslation2().second;
      *out << "1";
      outputSample(*out, featureDataIters[file_id1]->operator[](hypo_id1),
                   featureDataIters[file_id2]->operator[](hypo_id2));
      *out << endl;
      *out << "0";
      outputSample(*out, featureDataIters[file_id2]->operator[](hypo_id2),
                   featureDataIters[file_id1]->operator[](hypo_id1));
      *out << endl;
    }

    // Advance all iterators to the next sentence
    for (size_t i = 0; i < featureFiles.size(); ++i) {
      ++featureDataIters[i];
      ++scoreDataIters[i];
    }
    ++sentenceId;
  }

  outFile.close();

  return 0;
}