00001
00007 #include <iostream>
00008 #include <string>
00009 #include <vector>
00010
00011 #include <getopt.h>
00012 #include <boost/scoped_ptr.hpp>
00013
00014 #include "Data.h"
00015 #include "Scorer.h"
00016 #include "ScorerFactory.h"
00017 #include "Timer.h"
00018 #include "Util.h"
00019
00020 using namespace std;
00021 using namespace MosesTuning;
00022
00023 namespace
00024 {
00025
/**
 * Print the command-line synopsis to standard error and terminate.
 *
 * This function never returns: it always ends the process with exit
 * status 1, including when it is reached through --help / -h.
 */
void usage()
{
  // One line per option; the long and short spellings mirror long_options.
  cerr << "usage: extractor [options]" << endl;
  cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
  cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
  cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
  cerr << "[--reference|-r] comma separated list of reference files" << endl;
  cerr << "[--binary|-b] use binary output format (default to text )" << endl;
  cerr << "[--nbest|-n] the nbest file" << endl;
  cerr << "[--scfile|-S] the scorer data output file" << endl;
  cerr << "[--ffile|-F] the feature data output file" << endl;
  cerr << "[--prev-ffile|-E] comma separated list of previous feature data" << endl;
  cerr << "[--prev-scfile|-R] comma separated list of previous scorer data" << endl;
  cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
  cerr << "[--filter|-l] filter command used to preprocess the sentences" << endl;
  cerr << "[--allow-duplicates|-d] omit the duplicate removal step" << endl;
  // The option table also accepts the long form --verbose, so advertise it.
  cerr << "[--verbose|-v] verbose level" << endl;
  cerr << "[--help|-h] print this message and exit" << endl;
  exit(1);
}
00046
// Long option table handed to getopt_long().  Every entry has a null `flag`,
// so getopt_long() reports the entry's `val`, i.e. the equivalent short
// option character that the parsing switch dispatches on.
static struct option long_options[] = {
  // name                 has_arg            flag  val
  { "sctype",            required_argument, 0,    's' },
  { "scconfig",          required_argument, 0,    'c' },
  { "factors",           required_argument, 0,    'f' },
  { "filter",            required_argument, 0,    'l' },
  { "reference",         required_argument, 0,    'r' },
  { "binary",            no_argument,       0,    'b' },
  { "nbest",             required_argument, 0,    'n' },
  { "scfile",            required_argument, 0,    'S' },
  { "ffile",             required_argument, 0,    'F' },
  { "prev-scfile",       required_argument, 0,    'R' },
  { "prev-ffile",        required_argument, 0,    'E' },
  { "verbose",           required_argument, 0,    'v' },
  { "help",              no_argument,       0,    'h' },
  { "allow-duplicates",  no_argument,       0,    'd' },
  // All-zero sentinel marking the end of the table.
  { 0, 0, 0, 0 }
};
00064
00065
// Holds every setting that can be supplied on the command line, together
// with the value each one has when the corresponding option is absent.
struct ProgramOption {
  // Scorer selection and its settings.
  std::string scorerType;
  std::string scorerConfig;
  std::string scorerFactors;
  std::string scorerFilter;

  // Input files (comma separated lists are split later by the caller).
  std::string referenceFile;
  std::string nbestFile;

  // Output files.
  std::string scoreDataFile;
  std::string featureDataFile;

  // Data files from earlier runs.
  std::string prevScoreDataFile;
  std::string prevFeatureDataFile;

  bool binmode;
  bool allowDuplicates;
  int verbosity;

  // String members that are not listed below are default-constructed,
  // i.e. they start out empty.
  ProgramOption()
    : scorerType("BLEU"),
      scoreDataFile("statscore.data"),
      featureDataFile("features.data"),
      binmode(false),
      allowDuplicates(false),
      verbosity(0)
  {
  }
};
00096
00097 void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
00098 {
00099 int c;
00100 int option_index;
00101
00102 while ((c = getopt_long(argc, argv, "s:r:f:l:n:S:F:R:E:v:hbd", long_options, &option_index)) != -1) {
00103 switch (c) {
00104 case 's':
00105 opt->scorerType = string(optarg);
00106 break;
00107 case 'c':
00108 opt->scorerConfig = string(optarg);
00109 break;
00110 case 'f':
00111 opt->scorerFactors = string(optarg);
00112 break;
00113 case 'l':
00114 opt->scorerFilter = string(optarg);
00115 break;
00116 case 'r':
00117 opt->referenceFile = string(optarg);
00118 break;
00119 case 'b':
00120 opt->binmode = true;
00121 break;
00122 case 'n':
00123 opt->nbestFile = string(optarg);
00124 break;
00125 case 'S':
00126 opt->scoreDataFile = string(optarg);
00127 break;
00128 case 'F':
00129 opt->featureDataFile = string(optarg);
00130 break;
00131 case 'E':
00132 opt->prevFeatureDataFile = string(optarg);
00133 break;
00134 case 'R':
00135 opt->prevScoreDataFile = string(optarg);
00136 break;
00137 case 'v':
00138 opt->verbosity = atoi(optarg);
00139 break;
00140 case 'd':
00141 opt->allowDuplicates = true;
00142 break;
00143 default:
00144 usage();
00145 }
00146 }
00147 }
00148
00149 }
00150
00151 int main(int argc, char** argv)
00152 {
00153 ResetUserTime();
00154
00155 ProgramOption option;
00156 ParseCommandOptions(argc, argv, &option);
00157
00158 try {
00159
00160 if (option.scoreDataFile.length() == 0) {
00161 throw runtime_error("Error: output score statistics file is not specified");
00162 }
00163
00164
00165 if (option.featureDataFile.length() == 0) {
00166 throw runtime_error("Error: output feature file is not specified");
00167 }
00168
00169
00170 if ((option.nbestFile.length() > 0 && option.referenceFile.length() == 0)) {
00171 throw runtime_error("Error: reference file is not specified; you can not score the nbest");
00172 }
00173
00174 vector<string> nbestFiles;
00175 if (option.nbestFile.length() > 0) {
00176 Tokenize(option.nbestFile.c_str(), ',', &nbestFiles);
00177 }
00178
00179 vector<string> referenceFiles;
00180 if (option.referenceFile.length() > 0) {
00181 Tokenize(option.referenceFile.c_str(), ',', &referenceFiles);
00182 }
00183
00184 vector<string> prevScoreDataFiles;
00185 if (option.prevScoreDataFile.length() > 0) {
00186 Tokenize(option.prevScoreDataFile.c_str(), ',', &prevScoreDataFiles);
00187 }
00188
00189 vector<string> prevFeatureDataFiles;
00190 if (option.prevFeatureDataFile.length() > 0) {
00191 Tokenize(option.prevFeatureDataFile.c_str(), ',', &prevFeatureDataFiles);
00192 }
00193
00194 if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()) {
00195 throw runtime_error("Error: there is a different number of previous score and feature files");
00196 }
00197
00198 if (option.binmode) {
00199 cerr << "Binary write mode is selected" << endl;
00200 } else {
00201 cerr << "Binary write mode is NOT selected" << endl;
00202 }
00203
00204 TRACE_ERR("Scorer type: " << option.scorerType << endl);
00205
00206 boost::scoped_ptr<Scorer> scorer(
00207 ScorerFactory::getScorer(option.scorerType, option.scorerConfig));
00208
00209
00210 scorer->setFactors(option.scorerFactors);
00211 scorer->setFilter(option.scorerFilter);
00212
00213
00214 if (referenceFiles.size() > 0)
00215 scorer->setReferenceFiles(referenceFiles);
00216
00217
00218
00219 Data data(scorer.get());
00220
00221
00222 for (size_t i = 0; i < prevScoreDataFiles.size(); i++) {
00223 data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i));
00224 }
00225
00226
00227
00228
00229 for (size_t i = 0; i < nbestFiles.size(); i++) {
00230 data.loadNBest(nbestFiles.at(i));
00231 }
00232
00233
00234
00235
00236 if (!option.allowDuplicates) {
00237 data.removeDuplicates();
00238 }
00239
00240
00241 data.save(option.featureDataFile, option.scoreDataFile, option.binmode);
00242 PrintUserTime("Stopping...");
00243
00244 return EXIT_SUCCESS;
00245 } catch (const exception& e) {
00246 cerr << "Exception: " << e.what() << endl;
00247 return EXIT_FAILURE;
00248 }
00249 }