00001 #include "lm/common/print.hh"
00002 
00003 #include "lm/common/ngram_stream.hh"
00004 #include "util/file_stream.hh"
00005 #include "util/file.hh"
00006 #include "util/mmap.hh"
00007 #include "util/scoped.hh"
00008 
00009 #include <sstream>
00010 #include <cstring>
00011 
00012 namespace lm {
00013 
00014 VocabReconstitute::VocabReconstitute(int fd) {
00015   uint64_t size = util::SizeOrThrow(fd);
00016   util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
00017   const char *const start = static_cast<const char*>(memory_.get());
00018   const char *i;
00019   for (i = start; i != start + size; i += strlen(i) + 1) {
00020     map_.push_back(i);
00021   }
00022   
00023   map_.push_back(i);
00024 }
00025 
00026 namespace {
00027 template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) {
00028   out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
00029   for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
00030     out << ' ' << vocab.Lookup(*i);
00031   }
00032 }
00033 } 
00034 
00035 void PrintARPA::Run(const util::stream::ChainPositions &positions) {
00036   VocabReconstitute vocab(vocab_fd_);
00037   util::FileStream out(out_fd_);
00038   out << "\\data\\\n";
00039   for (size_t i = 0; i < positions.size(); ++i) {
00040     out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
00041   }
00042   out << '\n';
00043 
00044   for (unsigned order = 1; order < positions.size(); ++order) {
00045     out << "\\" << order << "-grams:" << '\n';
00046     for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) {
00047       PrintLead(vocab, stream, out);
00048       out << '\t' << stream->Value().backoff << '\n';
00049     }
00050     out << '\n';
00051   }
00052 
00053   out << "\\" << positions.size() << "-grams:" << '\n';
00054   for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) {
00055     PrintLead(vocab, stream, out);
00056     out << '\n';
00057   }
00058   out << '\n';
00059   out << "\\end\\\n";
00060 }
00061 
00062 }