00001 #ifndef LM_FILTER_COUNT_IO_H
00002 #define LM_FILTER_COUNT_IO_H
00003
00004 #include <fstream>
00005 #include <iostream>
00006 #include <string>
00007
00008 #include "util/file_stream.hh"
00009 #include "util/file.hh"
00010 #include "util/file_piece.hh"
00011
00012 namespace lm {
00013
00014 class CountOutput : boost::noncopyable {
00015 public:
00016 explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {}
00017
00018 void AddNGram(const StringPiece &line) {
00019 file_ << line << '\n';
00020 }
00021
00022 template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
00023 AddNGram(line);
00024 }
00025
00026 void AddNGram(const StringPiece &ngram, const StringPiece &line) {
00027 AddNGram(line);
00028 }
00029
00030 private:
00031 util::FileStream file_;
00032 };
00033
00034 class CountBatch {
00035 public:
00036 explicit CountBatch(std::streamsize initial_read)
00037 : initial_read_(initial_read) {
00038 buffer_.reserve(initial_read);
00039 }
00040
00041 void Read(std::istream &in) {
00042 buffer_.resize(initial_read_);
00043 in.read(&*buffer_.begin(), initial_read_);
00044 buffer_.resize(in.gcount());
00045 char got;
00046 while (in.get(got) && got != '\n')
00047 buffer_.push_back(got);
00048 }
00049
00050 template <class Output> void Send(Output &out) {
00051 for (util::TokenIter<util::SingleCharacter> line(StringPiece(&*buffer_.begin(), buffer_.size()), '\n'); line; ++line) {
00052 util::TokenIter<util::SingleCharacter> tabber(*line, '\t');
00053 if (!tabber) {
00054 std::cerr << "Warning: empty n-gram count line being removed\n";
00055 continue;
00056 }
00057 util::TokenIter<util::SingleCharacter, true> words(*tabber, ' ');
00058 if (!words) {
00059 std::cerr << "Line has a tab but no words.\n";
00060 continue;
00061 }
00062 out.AddNGram(words, util::TokenIter<util::SingleCharacter, true>::end(), *line);
00063 }
00064 }
00065
00066 private:
00067 std::streamsize initial_read_;
00068
00069
00070 std::vector<char> buffer_;
00071 };
00072
00073 template <class Output> void ReadCount(util::FilePiece &in_file, Output &out) {
00074 try {
00075 while (true) {
00076 StringPiece line = in_file.ReadLine();
00077 util::TokenIter<util::SingleCharacter> tabber(line, '\t');
00078 if (!tabber) {
00079 std::cerr << "Warning: empty n-gram count line being removed\n";
00080 continue;
00081 }
00082 out.AddNGram(*tabber, line);
00083 }
00084 } catch (const util::EndOfFileException &e) {}
00085 }
00086
00087 }
00088
00089 #endif // LM_FILTER_COUNT_IO_H