00001 #ifndef LM_FILTER_ARPA_IO_H
00002 #define LM_FILTER_ARPA_IO_H
00003 
00004 
00005 #include "lm/read_arpa.hh"
00006 #include "util/exception.hh"
00007 #include "util/file_stream.hh"
00008 #include "util/string_piece.hh"
00009 #include "util/tokenize_piece.hh"
00010 
00011 #include <boost/noncopyable.hpp>
00012 #include <boost/scoped_array.hpp>
00013 
00014 #include <fstream>
00015 #include <string>
00016 #include <vector>
00017 
00018 #include <cstring>
00019 #include <stdint.h>
00020 
00021 namespace util { class FilePiece; }
00022 
00023 namespace lm {
00024 
00025 class ARPAInputException : public util::Exception {
00026   public:
00027     explicit ARPAInputException(const StringPiece &message) throw();
00028     explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw();
00029     virtual ~ARPAInputException() throw();
00030 };
00031 
00032 
// Declared here, defined elsewhere. ReadARPA hands the result to
// Output::ReserveForCounts, so this is presumably the space the count header
// occupies for the given per-order n-gram counts -- confirm against the
// definition.
size_t SizeNeededForCounts(const std::vector<uint64_t> &number);
00034 
00035 
00036 
00037 
00038 
00039 class ARPAOutput : boost::noncopyable {
00040   public:
00041     explicit ARPAOutput(const char *name, size_t buffer_size = 65536);
00042 
00043     void ReserveForCounts(std::streampos reserve);
00044 
00045     void BeginLength(unsigned int length);
00046 
00047     void AddNGram(const StringPiece &line) {
00048       file_ << line << '\n';
00049       ++fast_counter_;
00050     }
00051 
00052     void AddNGram(const StringPiece &ngram, const StringPiece &line) {
00053       AddNGram(line);
00054     }
00055 
00056     template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
00057       AddNGram(line);
00058     }
00059 
00060     void EndLength(unsigned int length);
00061 
00062     void Finish();
00063 
00064   private:
00065     util::scoped_fd file_backing_;
00066     util::FileStream file_;
00067     size_t fast_counter_;
00068     std::vector<uint64_t> counts_;
00069 };
00070 
00071 
00072 template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {
00073   ReadNGramHeader(in, length);
00074   out.BeginLength(length);
00075   for (uint64_t i = 0; i < number; ++i) {
00076     StringPiece line = in.ReadLine();
00077     util::TokenIter<util::SingleCharacter> tabber(line, '\t');
00078     if (!tabber) throw ARPAInputException("blank line", line);
00079     if (!++tabber) throw ARPAInputException("no tab", line);
00080 
00081     out.AddNGram(*tabber, line);
00082   }
00083   out.EndLength(length);
00084 }
00085 
00086 template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
00087   std::vector<uint64_t> number;
00088   ReadARPACounts(in_lm, number);
00089   out.ReserveForCounts(SizeNeededForCounts(number));
00090   for (unsigned int i = 0; i < number.size(); ++i) {
00091     ReadNGrams(in_lm, i + 1, number[i], out);
00092   }
00093   ReadEnd(in_lm);
00094   out.Finish();
00095 }
00096 
00097 } 
00098 
00099 #endif // LM_FILTER_ARPA_IO_H