00001 #ifndef LM_READ_ARPA_H
00002 #define LM_READ_ARPA_H
00003
00004 #include "lm/lm_exception.hh"
00005 #include "lm/word_index.hh"
00006 #include "lm/weights.hh"
00007 #include "util/file_piece.hh"
00008
00009 #include <cstddef>
00010 #include <iosfwd>
00011 #include <vector>
00012
00013 namespace lm {
00014
// Reads the ARPA header and fills `number` with one n-gram count per order.
void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number);
// Reads and validates the section header introducing the n-grams of the given
// order `length`.
void ReadNGramHeader(util::FilePiece &in, unsigned int length);

// Consume what follows the last word of an n-gram line.  The Prob overload is
// for entries carrying no backoff field; the float overload parses the backoff
// value itself — NOTE(review): exact accepted syntax lives in the .cc, confirm
// there.
void ReadBackoff(util::FilePiece &in, Prob &weights);
void ReadBackoff(util::FilePiece &in, float &backoff);
// Convenience overloads: parse the backoff directly into the weights struct.
inline void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
ReadBackoff(in, weights.backoff);
}
inline void ReadBackoff(util::FilePiece &in, RestWeights &weights) {
ReadBackoff(in, weights.backoff);
}

// Reads the terminating end-of-data marker of an ARPA file.
void ReadEnd(util::FilePiece &in);

// Byte-indexed table of word-delimiter characters; passed to
// FilePiece::ReadDelimited by the templates below.
extern const bool kARPASpaces[256];
00030
00031
// Policy object deciding how to react when an ARPA entry carries a positive
// (and therefore invalid) log probability.  The reaction is fixed at
// construction time via a WarningAction.
class PositiveProbWarn {
public:
// Default policy: treat a positive probability as fatal (THROW_UP).
PositiveProbWarn() : action_(THROW_UP) {}

explicit PositiveProbWarn(WarningAction action) : action_(action) {}

// React to the offending probability `prob` per action_ (defined in the .cc).
void Warn(float prob);

private:
WarningAction action_;
};
00043
00044 template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
00045 try {
00046 float prob = f.ReadFloat();
00047 if (prob > 0.0) {
00048 warn.Warn(prob);
00049 prob = 0.0;
00050 }
00051 UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
00052 WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces));
00053 Weights &w = unigrams[word];
00054 w.prob = prob;
00055 ReadBackoff(f, w);
00056 } catch(util::Exception &e) {
00057 e << " in the 1-gram at byte " << f.Offset();
00058 throw;
00059 }
00060 }
00061
00062 template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
00063 ReadNGramHeader(f, 1);
00064 for (std::size_t i = 0; i < count; ++i) {
00065 Read1Gram(f, vocab, unigrams, warn);
00066 }
00067 vocab.FinishedLoading(unigrams);
00068 }
00069
00070
00071 template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, Iterator indices_out, Weights &weights, PositiveProbWarn &warn) {
00072 try {
00073 weights.prob = f.ReadFloat();
00074 if (weights.prob > 0.0) {
00075 warn.Warn(weights.prob);
00076 weights.prob = 0.0;
00077 }
00078 for (unsigned char i = 0; i < n; ++i, ++indices_out) {
00079 StringPiece word(f.ReadDelimited(kARPASpaces));
00080 WordIndex index = vocab.Index(word);
00081 *indices_out = index;
00082
00083 UTIL_THROW_IF(index == 0 && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)),
00084 FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
00085 }
00086 ReadBackoff(f, weights);
00087 } catch(util::Exception &e) {
00088 e << " in the " << static_cast<unsigned int>(n) << "-gram at byte " << f.Offset();
00089 throw;
00090 }
00091 }
00092
00093 }
00094
00095 #endif // LM_READ_ARPA_H