00001 #include "lm/read_arpa.hh"
00002
00003 #include "lm/blank.hh"
00004 #include "util/file.hh"
00005
00006 #include <cmath>
00007 #include <cstdlib>
00008 #include <iostream>
00009 #include <sstream>
00010 #include <vector>
00011
00012 #include <cctype>
00013 #include <cstring>
00014 #include <stdint.h>
00015
00016 #ifdef WIN32
00017 #include <float.h>
00018 #endif
00019
00020 namespace lm {
00021
00022
// Lookup table indexed by byte value. Exactly three entries are true:
// 9 ('\t'), 10 ('\n') and 32 (' '); every other byte maps to false.
// NOTE(review): the name suggests these are the token delimiters used when
// parsing ARPA text -- confirm against the callers.
const bool kARPASpaces[256] = {
  0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,  // 0x00-0x0f: '\t' (9) and '\n' (10)
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x10-0x1f
  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x20-0x2f: ' ' (32)
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x30-0x3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x40-0x4f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x50-0x5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x60-0x6f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x70-0x7f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x80-0x8f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x90-0x9f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xa0-0xaf
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xb0-0xbf
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xc0-0xcf
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xd0-0xdf
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xe0-0xef
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0   // 0xf0-0xff
};
00024
00025 namespace {
00026
00027 bool IsEntirelyWhiteSpace(const StringPiece &line) {
00028 for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) {
00029 if (!isspace(line.data()[i])) return false;
00030 }
00031 return true;
00032 }
00033
// Prefix that ReadARPACounts compares against the start of the first
// non-blank, non-comment line; a match means a binary file was handed to the
// ARPA parser, and a FormatLoadException is raised.
const char kBinaryMagic[] = "mmap lm http://kheafield.com/code";
00035
00036
00037 uint64_t ReadCount(const std::string &from) {
00038 std::stringstream stream(from);
00039 uint64_t ret;
00040 stream >> ret;
00041 UTIL_THROW_IF(!stream, FormatLoadException, "Bad count " << from);
00042 return ret;
00043 }
00044
00045 }
00046
00047 void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
00048 number.clear();
00049 StringPiece line = in.ReadLine();
00050
00051
00052
00053 while (IsEntirelyWhiteSpace(line) || starts_with(line, "#")) {
00054 line = in.ReadLine();
00055 }
00056
00057 if (line != "\\data\\") {
00058 if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) {
00059 UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
00060 }
00061 if (static_cast<size_t>(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic)
00062 UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?");
00063 UTIL_THROW_IF(line.size() >= 4 && StringPiece(line.data(), 4) == "blmt", FormatLoadException, "This looks like an IRSTLM binary file. Did you forget to pass --text yes to compile-lm?");
00064 UTIL_THROW_IF(line == "iARPA", FormatLoadException, "This looks like an IRSTLM iARPA file. You need an ARPA file. Run\n compile-lm --text yes " << in.FileName() << " " << in.FileName() << ".arpa\nfirst.");
00065 UTIL_THROW(FormatLoadException, "first non-empty line was \"" << line << "\" not \\data\\.");
00066 }
00067 while (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
00068 if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\"doesn't begin with \"ngram \"");
00069
00070 std::string remaining(line.data() + 6, line.size() - 6);
00071 char *end_ptr;
00072 unsigned int length = std::strtol(remaining.c_str(), &end_ptr, 10);
00073 if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line);
00074 if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line);
00075 ++end_ptr;
00076 number.push_back(ReadCount(end_ptr));
00077 }
00078 }
00079
00080 void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
00081 StringPiece line;
00082 while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
00083 std::stringstream expected;
00084 expected << '\\' << length << "-grams:";
00085 if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead");
00086 }
00087
00088 void ReadBackoff(util::FilePiece &in, Prob &) {
00089 switch (in.get()) {
00090 case '\t':
00091 {
00092 float got = in.ReadFloat();
00093 if (got != 0.0)
00094 UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff");
00095 }
00096 break;
00097 case '\n':
00098 break;
00099 default:
00100 UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff");
00101 }
00102 }
00103
00104 void ReadBackoff(util::FilePiece &in, float &backoff) {
00105
00106
00107
00108
00109
00110 switch (in.get()) {
00111 case '\t':
00112 backoff = in.ReadFloat();
00113 if (backoff == ngram::kExtensionBackoff) backoff = ngram::kNoExtensionBackoff;
00114 {
00115 #if defined(WIN32) && !defined(__MINGW32__)
00116 int float_class = _fpclass(backoff);
00117 UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff);
00118 #else
00119 int float_class = std::fpclassify(backoff);
00120 UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff);
00121 #endif
00122 }
00123 UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff");
00124 break;
00125 case '\n':
00126 backoff = ngram::kNoExtensionBackoff;
00127 break;
00128 default:
00129 UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff");
00130 }
00131 }
00132
00133 void ReadEnd(util::FilePiece &in) {
00134 StringPiece line;
00135 do {
00136 line = in.ReadLine();
00137 } while (IsEntirelyWhiteSpace(line));
00138 if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line);
00139
00140 try {
00141 while (true) {
00142 line = in.ReadLine();
00143 if (!IsEntirelyWhiteSpace(line)) UTIL_THROW(FormatLoadException, "Trailing line " << line);
00144 }
00145 } catch (const util::EndOfFileException &e) {}
00146 }
00147
00148 void PositiveProbWarn::Warn(float prob) {
00149 switch (action_) {
00150 case THROW_UP:
00151 UTIL_THROW(FormatLoadException, "Positive log probability " << prob << " in the model. This is a bug in IRSTLM; you can set config.positive_log_probability = SILENT or pass -i to build_binary to substitute 0.0 for the log probability. Error");
00152 case COMPLAIN:
00153 std::cerr << "There's a positive log probability " << prob << " in the APRA file, probably because of a bug in IRSTLM. This and subsequent entires will be mapped to 0 log probability." << std::endl;
00154 action_ = SILENT;
00155 break;
00156 case SILENT:
00157 break;
00158 }
00159 }
00160
00161 }