00001 #include "lm/model.hh"
00002 #include "lm/sizes.hh"
00003 #include "util/file_piece.hh"
00004 #include "util/usage.hh"
00005
00006 #include <algorithm>
00007 #include <cstdlib>
00008 #include <exception>
00009 #include <iostream>
00010 #include <iomanip>
00011 #include <limits>
00012 #include <cmath>
00013 #include <cstdlib>
00014
00015 #ifdef WIN32
00016 #include "util/getopt.hh"
00017 #else
00018 #include <unistd.h>
00019 #endif
00020
00021 namespace lm {
00022 namespace ngram {
00023 namespace {
00024
// Writes the command-line help text to stderr and terminates the process
// with exit status 1.  This function never returns.
//   name:        program name shown in the synopsis line (callers pass argv[0]).
//   default_mem: text reported as the default for -S.
void Usage(const char *name, const char *default_mem) {
  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
    "-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
    " Default is -100. The ARPA file will always take precedence.\n"
    "-s allows models to be built even if they do not have <s> and </s>.\n"
    "-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
    "-w mmap|after determines how writing is done.\n"
    " mmap maps the binary file and writes to it. Default for trie.\n"
    " after allocates anonymous memory, builds, and writes. Default for probing.\n"
    "-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
    " model files. order1.arpa must be an ARPA file. All others may be ARPA or\n"
    " the same data structure as being built. All files must have the same\n"
    " vocabulary. For probing, the unigrams must be in the same order.\n\n"
    "type is either probing or trie. Default is probing.\n\n"
    "probing uses a probing hash table. It is the fastest but uses the most memory.\n"
    "-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
    "trie is a straightforward trie with bit-level packing. It uses the least\n"
    "memory and is still faster than SRI or IRST. Building the trie format uses an\n"
    "on-disk sort to save memory.\n"
    "-T is the temporary directory prefix. Default is the output file name.\n"
    "-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n"
    // A plain '%' is used here: "\%" is not a standard escape sequence.
    " with GNU sort. The number is followed by a unit: % for percent of physical\n"
    " memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n"
    " Default unit is K for Kilobytes.\n"
    "-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
    "-b sets backoff quantization bits. Requires -q and defaults to that value.\n"
    "-a compresses pointers using an array of offsets. The parameter is the\n"
    " maximum number of bits encoded by the array. Memory is minimized subject\n"
    " to the maximum, so pick 255 to minimize memory.\n\n"
    "-h print this help message.\n\n"
    "Get a memory estimate by passing an ARPA file without an output file name.\n";
  std::exit(1);
}
00058
00059
00060 float ParseFloat(const char *from) {
00061 char *end;
00062 float ret = strtod(from, &end);
00063 if (*end) throw util::ParseNumberException(from);
00064 return ret;
00065 }
00066 unsigned long int ParseUInt(const char *from) {
00067 char *end;
00068 unsigned long int ret = strtoul(from, &end, 10);
00069 if (*end) throw util::ParseNumberException(from);
00070 return ret;
00071 }
00072
00073 uint8_t ParseBitCount(const char *from) {
00074 unsigned long val = ParseUInt(from);
00075 if (val > 25) {
00076 util::ParseNumberException e(from);
00077 e << " bit counts are limited to 25.";
00078 }
00079 return val;
00080 }
00081
// Splits a space-separated list of file names.
//   from: NUL-terminated text such as "order1.arpa order2 order3".
//   to:   receives the names in order; any previous contents are discarded.
// Runs of spaces, as well as leading and trailing spaces, are treated as a
// single separator, so no empty file names are produced.  An empty or
// all-space argument yields an empty list.
void ParseFileList(const char *from, std::vector<std::string> &to) {
  to.clear();
  while (*from) {
    // Skip the separator(s) in front of the next name.
    while (*from == ' ') ++from;
    // Find the end of the name: the next space or the terminating NUL.
    const char *finish = from;
    while (*finish && *finish != ' ') ++finish;
    if (finish != from) to.push_back(std::string(from, finish));
    from = finish;
  }
}
00092
// Tells the user on stderr that quantization was requested for a data
// structure that does not implement it, then ends the process with exit
// status 1.  Never returns.
void ProbingQuantizationUnsupported() {
  static const char kMessage[] =
      "Quantization is only implemented in the trie data structure.";
  std::cerr << kMessage << std::endl;
  std::exit(1);
}
00097
00098 }
00099 }
00100 }
00101
00102 int main(int argc, char *argv[]) {
00103 using namespace lm::ngram;
00104
00105 const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G";
00106
00107 if (argc == 2 && !strcmp(argv[1], "--help"))
00108 Usage(argv[0], default_mem);
00109
00110 try {
00111 bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
00112 lm::ngram::Config config;
00113 config.building_memory = util::ParseSize(default_mem);
00114 int opt;
00115 while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:h")) != -1) {
00116 switch(opt) {
00117 case 'q':
00118 config.prob_bits = ParseBitCount(optarg);
00119 if (!set_backoff_bits) config.backoff_bits = config.prob_bits;
00120 quantize = true;
00121 break;
00122 case 'b':
00123 config.backoff_bits = ParseBitCount(optarg);
00124 set_backoff_bits = true;
00125 break;
00126 case 'a':
00127 config.pointer_bhiksha_bits = ParseBitCount(optarg);
00128 bhiksha = true;
00129 break;
00130 case 'u':
00131 config.unknown_missing_logprob = ParseFloat(optarg);
00132 break;
00133 case 'p':
00134 config.probing_multiplier = ParseFloat(optarg);
00135 break;
00136 case 't':
00137 case 'T':
00138 config.temporary_directory_prefix = optarg;
00139 util::NormalizeTempPrefix(config.temporary_directory_prefix);
00140 break;
00141 case 'm':
00142 config.building_memory = ParseUInt(optarg) * 1048576;
00143 break;
00144 case 'S':
00145 config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg));
00146 break;
00147 case 'w':
00148 set_write_method = true;
00149 if (!strcmp(optarg, "mmap")) {
00150 config.write_method = Config::WRITE_MMAP;
00151 } else if (!strcmp(optarg, "after")) {
00152 config.write_method = Config::WRITE_AFTER;
00153 } else {
00154 Usage(argv[0], default_mem);
00155 }
00156 break;
00157 case 's':
00158 config.sentence_marker_missing = lm::SILENT;
00159 break;
00160 case 'i':
00161 config.positive_log_probability = lm::SILENT;
00162 break;
00163 case 'r':
00164 rest = true;
00165 ParseFileList(optarg, config.rest_lower_files);
00166 config.rest_function = Config::REST_LOWER;
00167 break;
00168 case 'h':
00169 default:
00170 Usage(argv[0], default_mem);
00171 }
00172 }
00173 if (!quantize && set_backoff_bits) {
00174 std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl;
00175 abort();
00176 }
00177 if (optind + 1 == argc) {
00178 ShowSizes(argv[optind], config);
00179 return 0;
00180 }
00181 const char *model_type;
00182 const char *from_file;
00183
00184 if (optind + 2 == argc) {
00185 model_type = "probing";
00186 from_file = argv[optind];
00187 config.write_mmap = argv[optind + 1];
00188 } else if (optind + 3 == argc) {
00189 model_type = argv[optind];
00190 from_file = argv[optind + 1];
00191 config.write_mmap = argv[optind + 2];
00192 } else {
00193 Usage(argv[0], default_mem);
00194 return 1;
00195 }
00196 if (!strcmp(model_type, "probing")) {
00197 if (!set_write_method) config.write_method = Config::WRITE_AFTER;
00198 if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
00199 if (rest) {
00200 RestProbingModel(from_file, config);
00201 } else {
00202 ProbingModel(from_file, config);
00203 }
00204 } else if (!strcmp(model_type, "trie")) {
00205 if (rest) {
00206 std::cerr << "Rest + trie is not supported yet." << std::endl;
00207 return 1;
00208 }
00209 if (!set_write_method) config.write_method = Config::WRITE_MMAP;
00210 if (quantize) {
00211 if (bhiksha) {
00212 QuantArrayTrieModel(from_file, config);
00213 } else {
00214 QuantTrieModel(from_file, config);
00215 }
00216 } else {
00217 if (bhiksha) {
00218 ArrayTrieModel(from_file, config);
00219 } else {
00220 TrieModel(from_file, config);
00221 }
00222 }
00223 } else {
00224 Usage(argv[0], default_mem);
00225 }
00226 }
00227 catch (const std::exception &e) {
00228 std::cerr << e.what() << std::endl;
00229 std::cerr << "ERROR" << std::endl;
00230 return 1;
00231 }
00232 std::cerr << "SUCCESS" << std::endl;
00233 return 0;
00234 }