00001 #ifndef LM_FILTER_ARPA_IO_H
00002 #define LM_FILTER_ARPA_IO_H
00003
00004
00005 #include "lm/read_arpa.hh"
00006 #include "util/exception.hh"
00007 #include "util/file_stream.hh"
00008 #include "util/string_piece.hh"
00009 #include "util/tokenize_piece.hh"
00010
00011 #include <boost/noncopyable.hpp>
00012 #include <boost/scoped_array.hpp>
00013
00014 #include <fstream>
00015 #include <string>
00016 #include <vector>
00017
00018 #include <cstring>
00019 #include <stdint.h>
00020
00021 namespace util { class FilePiece; }
00022
00023 namespace lm {
00024
00025 class ARPAInputException : public util::Exception {
00026 public:
00027 explicit ARPAInputException(const StringPiece &message) throw();
00028 explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw();
00029 virtual ~ARPAInputException() throw();
00030 };
00031
00032
// Returns a size derived from the per-order n-gram counts in `number`
// (number[0] is the count for order 1, and so on, as used by ReadARPA).
// ReadARPA forwards the result to Output::ReserveForCounts.
// NOTE(review): presumably the byte length of the ARPA counts header --
// confirm against the definition.
size_t SizeNeededForCounts(
    const std::vector<uint64_t> &number);
00034
00035
00036
00037
00038
00039 class ARPAOutput : boost::noncopyable {
00040 public:
00041 explicit ARPAOutput(const char *name, size_t buffer_size = 65536);
00042
00043 void ReserveForCounts(std::streampos reserve);
00044
00045 void BeginLength(unsigned int length);
00046
00047 void AddNGram(const StringPiece &line) {
00048 file_ << line << '\n';
00049 ++fast_counter_;
00050 }
00051
00052 void AddNGram(const StringPiece &ngram, const StringPiece &line) {
00053 AddNGram(line);
00054 }
00055
00056 template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
00057 AddNGram(line);
00058 }
00059
00060 void EndLength(unsigned int length);
00061
00062 void Finish();
00063
00064 private:
00065 util::scoped_fd file_backing_;
00066 util::FileStream file_;
00067 size_t fast_counter_;
00068 std::vector<uint64_t> counts_;
00069 };
00070
00071
00072 template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {
00073 ReadNGramHeader(in, length);
00074 out.BeginLength(length);
00075 for (uint64_t i = 0; i < number; ++i) {
00076 StringPiece line = in.ReadLine();
00077 util::TokenIter<util::SingleCharacter> tabber(line, '\t');
00078 if (!tabber) throw ARPAInputException("blank line", line);
00079 if (!++tabber) throw ARPAInputException("no tab", line);
00080
00081 out.AddNGram(*tabber, line);
00082 }
00083 out.EndLength(length);
00084 }
00085
00086 template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
00087 std::vector<uint64_t> number;
00088 ReadARPACounts(in_lm, number);
00089 out.ReserveForCounts(SizeNeededForCounts(number));
00090 for (unsigned int i = 0; i < number.size(); ++i) {
00091 ReadNGrams(in_lm, i + 1, number[i], out);
00092 }
00093 ReadEnd(in_lm);
00094 out.Finish();
00095 }
00096
00097 }
00098
00099 #endif // LM_FILTER_ARPA_IO_H