#include <cerrno>
#include <cstdlib>
#include <iostream>
#include <string>

#ifdef WITH_THREADS
#include <boost/thread/thread.hpp>
#endif

#include "moses/TypeDef.h"
#include "moses/TranslationModel/CompactPT/PhraseTableCreator.h"

#include "util/file.hh"
00011
00012 using namespace Moses;
00013
/**
 * Writes the usage text, the list of supported command-line options and the
 * citation notice to standard error.
 *
 * @param argv  Argument vector of the program; only argv[0] is read and it is
 *              echoed in the "Usage" line.
 */
void printHelp(char **argv)
{
  // Options that are always listed first.
  static const char *const basicOptions =
    " options: \n"
    "\t-in string -- input table file name\n"
    "\t-out string -- prefix of binary table file\n"
    "\t-T string -- path to temporary directory (uses /tmp by default)\n"
    "\t-nscores int -- number of score components in phrase table\n"
    "\t-no-alignment-info -- do not include alignment info in the binary phrase table\n";

#ifdef WITH_THREADS
  // Only advertised when the binary was built with thread support.
  static const char *const threadOption =
    "\t-threads int|all -- number of threads used for conversion\n";
#endif

  // Advanced options followed by the reference and acknowledgment text.
  static const char *const advancedAndCredits =
    "\n advanced:\n"
    "\t-encoding string -- encoding type: PREnc REnc None (default PREnc)\n"
    "\t-rankscore int -- score index of P(t|s) (default 2)\n"
    "\t-maxrank int -- maximum rank for PREnc (default 100)\n"
    "\t-landmark int -- use landmark phrase every 2^n source phrases (default 10)\n"
    "\t-fingerprint int -- number of bits used for source phrase fingerprints (default 16)\n"
    "\t-join-scores -- single set of Huffman codes for score components\n"
    "\t-quantize int -- maximum number of scores per score component\n"
    "\t-no-warnings -- suppress warnings about missing alignment data\n"
    "\n"
    " For more information see: http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc6\n\n"
    " If you use this please cite:\n\n"
    " @article { junczys_pbml98_2012,\n"
    " author = { Marcin Junczys-Dowmunt },\n"
    " title = { Phrasal Rank-Encoding: Exploiting Phrase Redundancy and\n"
    " Translational Relations for Phrase Table Compression },\n"
    " journal = { The Prague Bulletin of Mathematical Linguistics },\n"
    " volume = { 98 },\n"
    " year = { 2012 },\n"
    " note = { Proceedings of the MT Marathon 2012, Edinburgh },\n"
    " }\n\n"
    " Acknowledgments: Part of this research was carried out at and funded by\n"
    " the World Intellectual Property Organization (WIPO) in Geneva.\n\n";

  std::ostream &err = std::cerr;
  err << "Usage " << argv[0] << ":\n";
  err << basicOptions;
#ifdef WITH_THREADS
  err << threadOption;
#endif
  err << advancedAndCredits;
}
00050
00051
00052 int main(int argc, char **argv)
00053 {
00054
00055 std::string inFilePath;
00056 std::string outFilePath("out");
00057 std::string tempfilePath;
00058 PhraseTableCreator::Coding coding = PhraseTableCreator::PREnc;
00059
00060 size_t numScoreComponent = 4;
00061 size_t orderBits = 10;
00062 size_t fingerprintBits = 16;
00063 bool useAlignmentInfo = true;
00064 bool multipleScoreTrees = true;
00065 size_t quantize = 0;
00066 size_t maxRank = 100;
00067 bool sortScoreIndexSet = false;
00068 size_t sortScoreIndex = 2;
00069 bool warnMe = true;
00070 size_t threads =
00071 #ifdef WITH_THREADS
00072 boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() :
00073 #endif
00074 1;
00075
00076 if(1 >= argc) {
00077 printHelp(argv);
00078 return 1;
00079 }
00080 for(int i = 1; i < argc; ++i) {
00081 std::string arg(argv[i]);
00082 if("-in" == arg && i+1 < argc) {
00083 ++i;
00084 inFilePath = argv[i];
00085 } else if("-out" == arg && i+1 < argc) {
00086 ++i;
00087 outFilePath = argv[i];
00088 } else if("-T" == arg && i+1 < argc) {
00089 ++i;
00090 tempfilePath = argv[i];
00091 util::NormalizeTempPrefix(tempfilePath);
00092 } else if("-encoding" == arg && i+1 < argc) {
00093 ++i;
00094 std::string val(argv[i]);
00095 if(val == "None" || val == "none") {
00096 coding = PhraseTableCreator::None;
00097 } else if(val == "REnc" || val == "renc") {
00098 coding = PhraseTableCreator::REnc;
00099 } else if(val == "PREnc" || val == "prenc") {
00100 coding = PhraseTableCreator::PREnc;
00101 }
00102 } else if("-maxrank" == arg && i+1 < argc) {
00103 ++i;
00104 maxRank = atoi(argv[i]);
00105 } else if("-nscores" == arg && i+1 < argc) {
00106 ++i;
00107 numScoreComponent = atoi(argv[i]);
00108 } else if("-rankscore" == arg && i+1 < argc) {
00109 ++i;
00110 sortScoreIndex = atoi(argv[i]);
00111 sortScoreIndexSet = true;
00112 } else if("-no-alignment-info" == arg) {
00113 useAlignmentInfo = false;
00114 } else if("-landmark" == arg && i+1 < argc) {
00115 ++i;
00116 orderBits = atoi(argv[i]);
00117 } else if("-fingerprint" == arg && i+1 < argc) {
00118 ++i;
00119 fingerprintBits = atoi(argv[i]);
00120 } else if("-join-scores" == arg) {
00121 multipleScoreTrees = false;
00122 } else if("-quantize" == arg && i+1 < argc) {
00123 ++i;
00124 quantize = atoi(argv[i]);
00125 } else if("-no-warnings" == arg) {
00126 warnMe = false;
00127 } else if("-threads" == arg && i+1 < argc) {
00128 #ifdef WITH_THREADS
00129 ++i;
00130 if(std::string(argv[i]) == "all") {
00131 threads = boost::thread::hardware_concurrency();
00132 if(!threads) {
00133 std::cerr << "Could not determine number of hardware threads, setting to 1" << std::endl;
00134 threads = 1;
00135 }
00136 } else
00137 threads = atoi(argv[i]);
00138 #else
00139 std::cerr << "Thread support not compiled in" << std::endl;
00140 exit(1);
00141 #endif
00142 } else {
00143
00144 printHelp(argv);
00145 return 1;
00146 }
00147 }
00148
00149 if(!sortScoreIndexSet && numScoreComponent != 4 && coding == PhraseTableCreator::PREnc) {
00150 std::cerr << "WARNING: You are using a nonstandard number of scores ("
00151 << numScoreComponent << ") with PREnc. Set the index of P(t|s) "
00152 "with -rankscore int if it is not "
00153 << sortScoreIndex << "." << std::endl;
00154 }
00155
00156 if(sortScoreIndex >= numScoreComponent) {
00157 std::cerr << "ERROR: -rankscore " << sortScoreIndex << " is out of range (0 ... "
00158 << (numScoreComponent-1) << ")" << std::endl;
00159 abort();
00160 }
00161
00162 if(outFilePath.rfind(".minphr") != outFilePath.size() - 7)
00163 outFilePath += ".minphr";
00164
00165 PhraseTableCreator(inFilePath, outFilePath, tempfilePath,
00166 numScoreComponent, sortScoreIndex,
00167 coding, orderBits, fingerprintBits,
00168 useAlignmentInfo, multipleScoreTrees,
00169 quantize, maxRank, warnMe
00170 #ifdef WITH_THREADS
00171 , threads
00172 #endif
00173 );
00174 }