#include "lm/common/print.hh"

#include "lm/common/ngram_stream.hh"
#include "util/file_stream.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"

#include <cstring>
#include <sstream>
#include <stdexcept>
00011
00012 namespace lm {
00013
00014 VocabReconstitute::VocabReconstitute(int fd) {
00015 uint64_t size = util::SizeOrThrow(fd);
00016 util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
00017 const char *const start = static_cast<const char*>(memory_.get());
00018 const char *i;
00019 for (i = start; i != start + size; i += strlen(i) + 1) {
00020 map_.push_back(i);
00021 }
00022
00023 map_.push_back(i);
00024 }
00025
00026 namespace {
00027 template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) {
00028 out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
00029 for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
00030 out << ' ' << vocab.Lookup(*i);
00031 }
00032 }
00033 }
00034
00035 void PrintARPA::Run(const util::stream::ChainPositions &positions) {
00036 VocabReconstitute vocab(vocab_fd_);
00037 util::FileStream out(out_fd_);
00038 out << "\\data\\\n";
00039 for (size_t i = 0; i < positions.size(); ++i) {
00040 out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
00041 }
00042 out << '\n';
00043
00044 for (unsigned order = 1; order < positions.size(); ++order) {
00045 out << "\\" << order << "-grams:" << '\n';
00046 for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) {
00047 PrintLead(vocab, stream, out);
00048 out << '\t' << stream->Value().backoff << '\n';
00049 }
00050 out << '\n';
00051 }
00052
00053 out << "\\" << positions.size() << "-grams:" << '\n';
00054 for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) {
00055 PrintLead(vocab, stream, out);
00056 out << '\n';
00057 }
00058 out << '\n';
00059 out << "\\end\\\n";
00060 }
00061
00062 }