#include <corpus_count.hh>
Public Member Functions
CorpusCount (util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector< bool > &prune_words, const std::string &prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol)
void Run (const util::stream::ChainPosition &position)
Static Public Member Functions
static float DedupeMultiplier (std::size_t order)
static std::size_t VocabUsage (std::size_t vocab_estimate)
Definition at line 23 of file corpus_count.hh.
lm::builder::CorpusCount::CorpusCount (
    util::FilePiece & from,
    int vocab_write,
    uint64_t & token_count,
    WordIndex & type_count,
    std::vector< bool > & prune_words,
    const std::string & prune_vocab_filename,
    std::size_t entries_per_block,
    WarningAction disallowed_symbol
)
Definition at line 162 of file corpus_count.cc.
float lm::builder::CorpusCount::DedupeMultiplier (std::size_t order) [static]
Definition at line 154 of file corpus_count.cc.
void lm::builder::CorpusCount::Run (const util::stream::ChainPosition & position)
Definition at line 185 of file corpus_count.cc.
References util::BoolCharacter::Build(), count, lm::ngram::GrowableVocab< NewWordAction >::FindOrInsert(), util::scoped_base< T, Closer >::get(), util::stream::ChainPosition::GetChain(), lm::ngram::GrowableVocab< NewWordAction >::Index(), lm::builder::kBOS, lm::builder::kEOS, lm::kUNK, util::FilePiece::ReadLine(), lm::ngram::GrowableVocab< NewWordAction >::Size(), and util::Exception::what().
std::size_t lm::builder::CorpusCount::VocabUsage (std::size_t vocab_estimate) [static]
Definition at line 158 of file corpus_count.cc.