00001 #ifndef LM_BUILDER_CORPUS_COUNT_H
00002 #define LM_BUILDER_CORPUS_COUNT_H
00003
00004 #include "lm/lm_exception.hh"
00005 #include "lm/word_index.hh"
00006 #include "util/scoped.hh"
00007
00008 #include <cstddef>
00009 #include <string>
00010 #include <stdint.h>
00011 #include <vector>
00012
// Forward declarations only: this header uses these util types strictly by
// reference, so their full definitions do not need to be included here.
namespace util {

namespace stream {
class ChainPosition;
}  // namespace stream

class FilePiece;

}  // namespace util
00019
00020 namespace lm {
00021 namespace builder {
00022
00023 class CorpusCount {
00024 public:
00025
00026 static float DedupeMultiplier(std::size_t order);
00027
00028
00029 static std::size_t VocabUsage(std::size_t vocab_estimate);
00030
00031
00032
00033 CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol);
00034
00035 void Run(const util::stream::ChainPosition &position);
00036
00037 private:
00038 util::FilePiece &from_;
00039 int vocab_write_;
00040 uint64_t &token_count_;
00041 WordIndex &type_count_;
00042 std::vector<bool>& prune_words_;
00043 const std::string& prune_vocab_filename_;
00044
00045 std::size_t dedupe_mem_size_;
00046 util::scoped_malloc dedupe_mem_;
00047
00048 WarningAction disallowed_symbol_action_;
00049 };
00050
00051 }
00052 }
00053 #endif // LM_BUILDER_CORPUS_COUNT_H